diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index e29881fcbac0175b7cbcd93c82fbecd8d9d59b59..68aff793ae6aa55ac1eedc130446c0b5e7046f2a 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import sys
diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
index 270663c415c7206b82bde00377da3f45ecc08b70..7045d8810493e5c79d670a11401b99cf16268a2e 100644
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import os
diff --git a/.buildkite/lm-eval-harness/conftest.py b/.buildkite/lm-eval-harness/conftest.py
index 769d2efda4adc494cd9e78074b30b4a721cb279a..c0d60dd5328f454b91dda36b216a09696671b0bc 100644
--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 
 import pytest
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index 409a6ca82008243ace99a3c4dc735305cdb1730c..930adfaf3e192febfa09cd34b8cb8f0e05d2dd7a 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index d3f5fc5cd4cee6a40d2d2a3fcb4677259f173b23..72c52d5bb5e9ba8fe9b9602862ed2a9b20aa9ab0 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
 
 ### Visualizing the results
 
-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 7f2a2d8dc2969275bd0739da8dd8c976ad728c42..a4f1638c1adb8db7336960f9e227b23b054181a8 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import os
diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
index 778a3a8d87f63f3aba83d21d2a36b8159e7f81b9..8532ff7ef798cc039926ef8781fccab71f5e17a9 100644
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 
diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
index 10a7a2f5a467e7ba9fefc08914a4af033b89b163..053fd52c35ae906710cbe08579becf0eafc72c4b 100644
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import json
diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
index e5f179a0f5b68b4f684869bc652827f69d6266ef..ddea1d2b1b1ed5710f5f878c663e83f388a62b73 100644
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from lmdeploy.serve.openai.api_client import APIClient
 
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index 2a7b37991f31a0e4a553e3f3b300b4cc37d19da4..fb3b9d5e34e03c7708f560301a7b802f3e3906bc 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import datetime
 import json
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index b3c27e2c99c2b2c2741871bd4276cb6929139ef1..16b5ad0297fe79ced4bbb25f7a1ced0c58425c22 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,5 +1,6 @@
 steps:
   - label: "Build wheel - CUDA 12.8"
+    id: build-wheel-cuda-12-8
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -11,6 +12,7 @@ steps:
       DOCKER_BUILDKIT: "1"
 
   - label: "Build wheel - CUDA 12.6"
+    id: build-wheel-cuda-12-6
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -28,6 +30,7 @@ steps:
 
   - label: "Build wheel - CUDA 11.8"
     # depends_on: block-build-cu118-wheel
+    id: build-wheel-cuda-11-8
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -44,6 +47,7 @@ steps:
 
   - label: "Build release image"
     depends_on: block-release-image-build
+    id: build-release-image
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -51,6 +55,18 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
+  - label: "Annotate release workflow"
+    depends_on:
+      - build-release-image
+      - build-wheel-cuda-12-8
+      - build-wheel-cuda-12-6
+      - build-wheel-cuda-11-8
+    id: annotate-release-workflow
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/annotate-release.sh"
+
   - label: "Build and publish TPU release image"
     depends_on: ~
     if: build.env("NIGHTLY") == "1"
@@ -70,9 +86,10 @@ steps:
       DOCKER_BUILDKIT: "1"
 
   - input: "Provide Release version here"
+    id: input-release-version
     fields:
       - text: "What is the release version?"
-        key: "release-version"
+        key: release-version
 
   - block: "Build CPU release image"
     key: block-cpu-release-image-build
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
new file mode 100755
index 0000000000000000000000000000000000000000..94e0ac2398f34ecf5f5b1c957a0373d6a61c9218
--- /dev/null
+++ b/.buildkite/scripts/annotate-release.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+set -ex
+
+# Get release version and strip leading 'v' if present
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
+
+if [ -z "$RELEASE_VERSION" ]; then
+  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
+  exit 1
+fi
+
+buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
+To download the wheel:
+\`\`\`
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . 
+\`\`\`
+
+To download and upload the image:
+
+\`\`\`
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
+docker tag vllm/vllm-openai vllm/vllm-openai:latest
+docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
+docker push vllm/vllm-openai:latest
+docker push vllm/vllm-openai:v${RELEASE_VERSION}
+\`\`\`
+EOF 
\ No newline at end of file
diff --git a/.buildkite/scripts/ci-clean-log.sh b/.buildkite/scripts/ci-clean-log.sh
new file mode 100644
index 0000000000000000000000000000000000000000..69d8a3a288316b547a85e39caec5021d15fca563
--- /dev/null
+++ b/.buildkite/scripts/ci-clean-log.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Usage: ./ci_clean_log.sh ci.log
+# This script strips timestamps and color codes from CI log files.
+
+# Check if argument is given
+if [ $# -lt 1 ]; then
+    echo "Usage: $0 ci.log"
+    exit 1
+fi
+
+INPUT_FILE="$1"
+
+# Strip timestamps
+sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
+
+# Strip colorization
+sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index bbc896ec68190b5b05b47be6d0a6c8e1c4d8ef7d..6e9af1e721bb70cdf38914ea32f48156bbb7248f 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
 
+if [[ $commands == *"pytest -v -s lora"* ]]; then
+  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
+fi
+
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
   commands="${commands} \
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
index 077bd9914907945d5a99f964eda7377a3ed71294..36bcb015d308ebf096662060c4e9726db10e0927 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -7,6 +7,7 @@ set -ex
 # Setup cleanup
 remove_docker_container() {
   if [[ -n "$container_id" ]]; then
+      podman stop --all -t0
       podman rm -f "$container_id" || true
   fi
   podman system prune -f
@@ -37,7 +38,7 @@ function cpu_tests() {
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
+    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 40f3df96065d184cad52e2e59625add4750c7587..bbcde4009c0eb9e9f30ed14bb4b7cfc7a25ea0a2 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -6,72 +6,70 @@ set -ex
 
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
+export CMAKE_BUILD_PARALLEL_LEVEL=32
+
 # Setup cleanup
 remove_docker_container() { 
     set -e; 
-    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
-    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
+    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; 
 }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
-  export BUILDKITE_BUILD_NUMBER=$3
 
   # offline inference
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
     set -e
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
   # Run basic model test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/decoder_only/language -m cpu_model
-    pytest -v -s tests/models/embedding/language -m cpu_model
-    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
-    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
-    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
+    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    pytest -v -s tests/models/language/generation -m cpu_model
+    pytest -v -s tests/models/language/pooling -m cpu_model
+    pytest -v -s tests/models/multimodal/generation \
+                --ignore=tests/models/multimodal/generation/test_mllama.py \
+                --ignore=tests/models/multimodal/generation/test_pixtral.py \
+                -m cpu_model"
 
   # Run compressed-tensor test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
   # Run AWQ test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    VLLM_USE_V1=0 pytest -s -v \
     tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v -k cpu_model \
     tests/basic_correctness/test_chunked_prefill.py"  
 
   # online serving
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    export VLLM_CPU_KVCACHE_SPACE=10 
-    export VLLM_CPU_OMP_THREADS_BIND=$1
     python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
     timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
     python3 benchmarks/benchmark_serving.py \
@@ -83,7 +81,7 @@ function cpu_tests() {
       --tokenizer facebook/opt-125m"
 
   # Run multi-lora tests
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/lora/test_qwen2vl.py"
@@ -91,4 +89,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
+timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 2d375d7e9d8711502bfc104737972ee78fc482c3..a2a5c2a02cbb9776eda2a42fa4232861bb82896c 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -2,102 +2,184 @@
 
 set -xu
 
-# Build the docker image.
-docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 
-# Set up cleanup.
-remove_docker_container() { docker rm -f tpu-test || true; }
+remove_docker_container() { 
+    docker rm -f tpu-test || true; 
+    docker rm -f vllm-tpu || true;
+}
+
 trap remove_docker_container EXIT
+
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 
+# Build the docker image.
+docker build -f docker/Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+cleanup_docker() {
+  # Get Docker's root directory
+  docker_root=$(docker info -f '{{.DockerRootDir}}')
+  if [ -z "$docker_root" ]; then
+    echo "Failed to determine Docker root directory."
+    exit 1
+  fi
+  echo "Docker root directory: $docker_root"
+  # Check disk usage of the filesystem where Docker's root directory is located
+  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  # Define the threshold
+  threshold=70
+  if [ "$disk_usage" -gt "$threshold" ]; then
+    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
+    docker image prune -f
+    # Remove unused volumes / force the system prune for old images as well.
+    docker volume prune -f && docker system prune --force --filter "until=72h" --all
+    echo "Docker images and volumes cleanup completed."
+  else
+    echo "Disk usage is below $threshold%. No cleanup needed."
+  fi
+}
+cleanup_docker
+
 # For HF_TOKEN.
 source /etc/environment
-# Run a simple end-to-end example.
+
 docker run --privileged --net host --shm-size=16G -it \
     -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install pytest pytest-asyncio tpu-info \
-    && python3 -m pip install lm_eval[api]==0.4.4 \
-    && export VLLM_XLA_CACHE_PATH= \
-    && export VLLM_USE_V1=1 \
-    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
-    && echo HARDWARE \
-    && tpu-info \
-    && { \
-        echo TEST_0: Running test_perf.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
-        echo TEST_0_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_1: Running test_compilation.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
-        echo TEST_1_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_2: Running test_basic.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
-        echo TEST_2_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
-        python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
-        echo TEST_3_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_4: Running test_quantization_accuracy.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
-        echo TEST_4_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_5: Running examples/offline_inference/tpu.py; \
-        python3 /workspace/vllm/examples/offline_inference/tpu.py; \
-        echo TEST_5_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_6: Running test_tpu_model_runner.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
-        echo TEST_6_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_7: Running test_sampler.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
-        echo TEST_7_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_8: Running test_topk_topp_sampler.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
-        echo TEST_8_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_9: Running test_multimodal.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
-        echo TEST_9_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_10: Running test_pallas.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
-        echo TEST_10_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_11: Running test_struct_output_generate.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
-        echo TEST_11_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_12: Running test_moe_pallas.py; \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
-        echo TEST_12_EXIT_CODE: \$?; \
-    } & \
-    # Disable the TPU LoRA tests until the feature is activated
-    # & { \
-    #     echo TEST_13: Running test_moe_pallas.py; \
-    #     python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
-    #     echo TEST_13_EXIT_CODE: \$?; \
-    # } & \
-    wait \
-    && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
-"
+    vllm-tpu /bin/bash -c '
+set -e # Exit immediately if a command exits with a non-zero status.
+set -u # Treat unset variables as an error.
+
+echo "--- Starting script inside Docker container ---"
+
+# Create results directory
+RESULTS_DIR=$(mktemp -d)
+# If mktemp fails, set -e will cause the script to exit.
+echo "Results will be stored in: $RESULTS_DIR"
+
+# Install dependencies
+echo "--- Installing Python dependencies ---"
+python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
+    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
+    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
+echo "--- Python dependencies installed ---"
+export VLLM_USE_V1=1
+export VLLM_XLA_CHECK_RECOMPILATION=1
+export VLLM_XLA_CACHE_PATH=
+echo "Using VLLM V1"
+
+echo "--- Hardware Information ---"
+tpu-info
+echo "--- Starting Tests ---"
+set +e
+overall_script_exit_code=0
+
+# --- Test Definitions ---
+# If a test fails, this function will print logs and will not cause the main script to exit.
+run_test() {
+    local test_num=$1
+    local test_name=$2
+    local test_command=$3
+    local log_file="$RESULTS_DIR/test_${test_num}.log"
+    local actual_exit_code
+
+    echo "--- TEST_$test_num: Running $test_name ---"
+    
+    # Execute the test command.
+    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
+    actual_exit_code=$?
+
+    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
+    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
+
+    if [ "$actual_exit_code" -ne 0 ]; then
+        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
+        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
+        if [ -f "$log_file" ]; then
+            cat "$log_file" >&2
+        else
+            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
+        fi
+        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
+        return "$actual_exit_code" # Return the failure code
+    else
+        echo "TEST_$test_num ($test_name) PASSED."
+        return 0 # Return success
+    fi
+}
+
+# Helper function to call run_test and update the overall script exit code
+run_and_track_test() {
+    local test_num_arg="$1"
+    local test_name_arg="$2"
+    local test_command_arg="$3"
+
+    # Run the test
+    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
+    local test_specific_exit_code=$?
+
+    # If the test failed, set the overall script exit code to 1
+    if [ "$test_specific_exit_code" -ne 0 ]; then
+        # No need for extra echo here, run_test already logged the failure.
+        overall_script_exit_code=1
+    fi
+}
+
+# --- Actual Test Execution ---
+run_and_track_test 0 "test_perf.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
+run_and_track_test 1 "test_compilation.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
+run_and_track_test 2 "test_basic.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
+run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
+run_and_track_test 4 "test_quantization_accuracy.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
+run_and_track_test 5 "examples/offline_inference/tpu.py" \
+    "python3 /workspace/vllm/examples/offline_inference/tpu.py"
+run_and_track_test 6 "test_tpu_model_runner.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
+run_and_track_test 7 "test_sampler.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
+run_and_track_test 8 "test_topk_topp_sampler.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
+run_and_track_test 9 "test_multimodal.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
+run_and_track_test 10 "test_pallas.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
+run_and_track_test 11 "test_struct_output_generate.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+run_and_track_test 12 "test_moe_pallas.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
+run_and_track_test 13 "test_lora.py" \
+    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
+run_and_track_test 14 "test_tpu_qkv_linear.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
+run_and_track_test 15 "test_spmd_model_weight_loading.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
+
+# After all tests have been attempted, exit with the overall status.
+if [ "$overall_script_exit_code" -ne 0 ]; then
+    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
+else
+    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
+fi
+exit "$overall_script_exit_code"
+' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
+
+# Capture the exit code of the docker run command
+DOCKER_RUN_EXIT_CODE=$?
 
+# The trap will run for cleanup.
+# Exit the main script with the Docker run command's exit code.
+if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
+    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
+    exit "$DOCKER_RUN_EXIT_CODE"
+else
+    echo "Docker run command completed successfully."
+    exit 0
+fi
 # TODO: This test fails because it uses RANDOM_SEED sampling
-# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
diff --git a/.buildkite/scripts/rerun-test.sh b/.buildkite/scripts/rerun-test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d79c0d5f381b149c2cf14c3b14afc94820e00d70
--- /dev/null
+++ b/.buildkite/scripts/rerun-test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Usage: ./rerun_test.sh path/to/test.py::test_name
+
+# Check if argument is given
+if [ $# -lt 1 ]; then
+    echo "Usage: $0 path/to/test.py::test_name"
+    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
+    exit 1
+fi
+
+TEST=$1
+COUNT=1
+
+while pytest -sv "$TEST"; do
+    COUNT=$((COUNT + 1))
+    echo "RUN NUMBER ${COUNT}"
+done
diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh
new file mode 100755
index 0000000000000000000000000000000000000000..209d9c4341cdd83033a92fb878ceb8e6b13d5298
--- /dev/null
+++ b/.buildkite/scripts/tpu/cleanup_docker.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -euo pipefail
+
+docker_root=$(docker info -f '{{.DockerRootDir}}')
+if [ -z "$docker_root" ]; then
+  echo "Failed to determine Docker root directory."
+  exit 1
+fi
+echo "Docker root directory: $docker_root"
+# Check disk usage of the filesystem where Docker's root directory is located
+disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+# Define the threshold
+threshold=70
+if [ "$disk_usage" -gt "$threshold" ]; then
+  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+  # Remove dangling images (those that are not tagged and not used by any container)
+  docker image prune -f
+  # Remove unused volumes / force the system prune for old images as well.
+  docker volume prune -f && docker system prune --force --filter "until=72h" --all
+  echo "Docker images and volumes cleanup completed."
+else
+  echo "Disk usage is below $threshold%. No cleanup needed."
+fi
diff --git a/.buildkite/scripts/tpu/config_v6e_1.env b/.buildkite/scripts/tpu/config_v6e_1.env
new file mode 100644
index 0000000000000000000000000000000000000000..44175864734746458fd1c0c69bf3c5d907f71848
--- /dev/null
+++ b/.buildkite/scripts/tpu/config_v6e_1.env
@@ -0,0 +1,14 @@
+# Environment config
+TEST_NAME=llama8b
+CONTAINER_NAME=vllm-tpu
+
+# vllm config
+MODEL=meta-llama/Llama-3.1-8B-Instruct
+MAX_NUM_SEQS=512
+MAX_NUM_BATCHED_TOKENS=512
+TENSOR_PARALLEL_SIZE=1
+MAX_MODEL_LEN=2048
+DOWNLOAD_DIR=/mnt/disks/persist
+EXPECTED_THROUGHPUT=8.0
+INPUT_LEN=1800
+OUTPUT_LEN=128
diff --git a/.buildkite/scripts/tpu/docker_run_bm.sh b/.buildkite/scripts/tpu/docker_run_bm.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6705da03e3d761baf4d8849bba5444cc0e9c7c6b
--- /dev/null
+++ b/.buildkite/scripts/tpu/docker_run_bm.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+if [ ! -f "$1" ]; then
+  echo "Error: The env file '$1' does not exist."
+  exit 1  # Exit the script with a non-zero status to indicate an error
+fi
+
+ENV_FILE=$1
+
+# For testing on local vm, use `set -a` to export all variables
+source /etc/environment
+source $ENV_FILE
+
+remove_docker_container() { 
+    docker rm -f tpu-test || true; 
+    docker rm -f vllm-tpu || true;
+    docker rm -f $CONTAINER_NAME || true;
+}
+
+trap remove_docker_container EXIT
+
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# Build docker image.
+# TODO: build the image outside the script and share the image with other
+# tpu test if building time is too long.
+DOCKER_BUILDKIT=1 docker build \
+  --build-arg max_jobs=16 \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg GIT_REPO_CHECK=0 \
+  --tag vllm/vllm-tpu-bm \
+  --progress plain -f docker/Dockerfile.tpu .
+
+LOG_ROOT=$(mktemp -d)
+# If mktemp fails, set -e will cause the script to exit.
+echo "Results will be stored in: $LOG_ROOT"
+
+if [ -z "$HF_TOKEN" ]; then
+  echo "Error: HF_TOKEN is not set or is empty."  
+  exit 1
+fi
+
+# Make sure mounted disk or dir exists
+if [ ! -d "$DOWNLOAD_DIR" ]; then
+    echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
+    exit 1
+fi
+
+echo "Run model $MODEL"
+echo
+
+echo "starting docker...$CONTAINER_NAME"
+echo    
+docker run \
+ -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
+ --env-file $ENV_FILE \
+ -e HF_TOKEN="$HF_TOKEN" \
+ -e TARGET_COMMIT=$BUILDKITE_COMMIT \
+ -e MODEL=$MODEL \
+ -e WORKSPACE=/workspace \
+ --name $CONTAINER_NAME \
+ -d \
+ --privileged \
+ --network host \
+ -v /dev/shm:/dev/shm \
+ vllm/vllm-tpu-bm tail -f /dev/null
+
+echo "run script..."
+echo
+docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
+
+echo "copy result back..."
+VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
+BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
+docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG" 
+docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
+
+throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
+echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
+
+if [ "$BUILDKITE" = "true" ]; then
+  echo "Running inside Buildkite"
+  buildkite-agent artifact upload "$VLLM_LOG" 
+  buildkite-agent artifact upload "$BM_LOG"
+else
+  echo "Not running inside Buildkite"
+fi
+
+#
+# compare the throughput with EXPECTED_THROUGHPUT 
+# and assert meeting the expectation
+# 
+if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
+  echo "Failed to get the throughput"
+  exit 1
+fi
+
+if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
+  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
+  exit 1
+fi
diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh
new file mode 100755
index 0000000000000000000000000000000000000000..877669cd956ac5f5977d91ac3e0f645e73e5cd05
--- /dev/null
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+
+set -euo pipefail
+
+VLLM_LOG="$WORKSPACE/vllm_log.txt"
+BM_LOG="$WORKSPACE/bm_log.txt"
+
+if [ -n "$TARGET_COMMIT" ]; then
+  head_hash=$(git rev-parse HEAD)
+  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
+    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
+    exit 1
+  fi
+fi
+
+echo "model: $MODEL"
+echo
+
+#
+# create a log folder
+#
+mkdir "$WORKSPACE/log"
+
+# TODO: Move to image building.
+pip install pandas
+pip install datasets
+
+#
+# create sonnet_4x
+#
+echo "Create sonnet_4x.txt"
+echo "" > benchmarks/sonnet_4x.txt
+for _ in {1..4}
+ do
+  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
+done
+
+#
+# start vllm service in backend
+#
+echo "lanching vllm..."
+echo "logging to $VLLM_LOG"
+echo
+
+VLLM_USE_V1=1 vllm serve $MODEL \
+ --seed 42 \
+ --disable-log-requests \
+ --max-num-seqs $MAX_NUM_SEQS \
+ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
+ --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
+ --no-enable-prefix-caching \
+ --download_dir $DOWNLOAD_DIR \
+ --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
+
+
+echo "wait for 20 minutes.."
+echo
+# sleep 1200
+# wait for 10 minutes...
+for i in {1..120}; do
+    # TODO: detect other type of errors.
+    if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
+        echo "Detected RuntimeError, exiting."
+        exit 1
+    elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
+        echo "Application started"
+        break
+    else
+        echo "wait for 10 seconds..."
+        sleep 10
+    fi
+done
+
+#
+# run test
+#
+echo "run benchmark test..."
+echo "logging to $BM_LOG"
+echo
+python benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --model $MODEL  \
+    --dataset-name sonnet \
+    --dataset-path benchmarks/sonnet_4x.txt \
+    --sonnet-input-len $INPUT_LEN \
+    --sonnet-output-len $OUTPUT_LEN \
+    --ignore-eos > "$BM_LOG"
+
+echo "completed..."
+echo
+
+throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
+echo "throughput: $throughput"
+echo
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 80a5a610c8ac99b9814c55a6c0924569f340b34e..b739851cb90528b0f1b7feab14b14a6c4ded0802 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -145,6 +145,7 @@ steps:
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
   - tests/v1/test_async_llm_dp.py
+  - tests/v1/engine/test_engine_core_client.py
   commands:
   # test with tp=2 and external_dp=2
   - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@@ -154,6 +155,7 @@ steps:
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
@@ -199,8 +201,9 @@ steps:
   - tests/test_sequence
   - tests/test_config
   - tests/test_logger
+  - tests/test_vllm_port
   commands:
-  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
 
@@ -274,17 +277,6 @@ steps:
     - pytest -v -s samplers
     - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 
-- label: LogitsProcessor Test # 5min
-  mirror_hardwares: [amdexperimental, amdproduction]
-  source_file_dependencies:
-  - vllm/model_executor/layers
-  - vllm/model_executor/guided_decoding
-  - tests/test_logits_processor
-  - tests/model_executor/test_guided_processors
-  commands:
-    - pytest -v -s test_logits_processor.py
-    - pytest -v -s model_executor/test_guided_processors.py
-
 - label: Speculative decoding tests # 40min
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
@@ -297,7 +289,7 @@ steps:
     - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 
 - label: LoRA Test %N # 15min each
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
@@ -328,6 +320,7 @@ steps:
   # these tests need to be separated, cannot combine
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
+  - pytest -v -s compile/piecewise/test_full_cudagraph.py
 
 - label: PyTorch Fullgraph Test # 18min
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -397,6 +390,17 @@ steps:
     - pytest -v -s tensorizer_loader
     - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 
+- label: Model Executor Test
+  mirror_hardwares: [amdexperimental, amdproduction]
+  soft_fail: true
+  source_file_dependencies:
+  - vllm/model_executor
+  - tests/model_executor
+  commands:
+    - apt-get update && apt-get install -y curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s model_executor
+
 - label: Benchmarks # 9min
   mirror_hardwares: [amdexperimental, amdproduction]
   working_dir: "/vllm-workspace/.buildkite"
@@ -420,6 +424,9 @@ steps:
   - vllm/model_executor/layers/quantization
   - tests/quantization
   commands:
+  # temporary install here since we need nightly, will move to requirements/test.in
+  # after torchao 0.12 release
+  - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 
 - label: LM Eval Small Models # 53min
@@ -617,9 +624,11 @@ steps:
   - vllm/worker/model_runner.py
   - entrypoints/llm/test_collective_rpc.py
   - tests/v1/test_async_llm_dp.py
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
   - vllm/v1/engine/
   commands:
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 4452ce22d504ed1336c81ecaf0d64365962a06ea..e98ccd035ee90997f72cb1ba748b34e6e54fcdac 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -10,15 +10,17 @@
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin @russellb
+/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
+/vllm/reasoning @aarnphm
+/vllm/entrypoints @aarnphm
 CMakeLists.txt @tlrmchlsmth
 
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb
+/vllm/v1/structured_output @mgoin @russellb @aarnphm
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
@@ -27,8 +29,8 @@ CMakeLists.txt @tlrmchlsmth
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
-/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
+/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/model_executor/test_guided_processors.py @mgoin @russellb
 /tests/models @DarkLight1337 @ywang96
@@ -38,11 +40,11 @@ CMakeLists.txt @tlrmchlsmth
 /tests/quantization @mgoin @robertgshaw2-redhat
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
-/tests/v1/structured_output @mgoin @russellb
+/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
+/tests/v1/structured_output @mgoin @russellb @aarnphm
 /tests/weight_loading @mgoin @youkaichao
 /tests/lora @jeejeelee
 
 # Docs
 /docs @hmellor
-mkdocs.yaml @hmellor
\ No newline at end of file
+mkdocs.yaml @hmellor
diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml
index f05be2ba8707afcffdc89759263137e8d6bc8d72..8c5c28cd77cff594196100f00b22e3c0220dbc09 100644
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@@ -8,6 +8,16 @@ body:
   attributes:
     value: >
       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: markdown
+  attributes:
+    value: |
+      ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
+      - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
+      - Passwords or authentication credentials
+      - Private URLs or endpoints
+      - Personal or confidential data
+      
+      Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
 - type: textarea
   attributes:
     label: Your current environment
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 65be771b94fb9a2791d7e61099248e7e532c8a9e..017ec7ca82da78302a551e3d10408bb1d2ceb41e 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,6 +1,18 @@
-FILL IN THE PR DESCRIPTION HERE
+## Essential Elements of an Effective PR Description Checklist
+- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
+- [ ] The test plan, such as providing test command.
+- [ ] The test results, such as pasting the results comparison before and after, or e2e results
+- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
 
-FIX #xxxx (*link existing issues this PR will resolve*)
+PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
+
+## Purpose
+
+## Test Plan
+
+## Test Result
+
+## (Optional) Documentation Update
 
 <!--- pyml disable-next-line no-emphasis-as-heading -->
 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
diff --git a/.github/mergify.yml b/.github/mergify.yml
index e595060c325a5fe128738eb22dbff13fa383baa3..5692bb5d363d8013e09dd26c9e1534dfd9871a5a 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -36,6 +36,20 @@ pull_request_rules:
       add:
         - frontend
 
+- name: label-llama
+  description: Automatically apply llama label
+  conditions:
+    - or:
+      - files~=^examples/.*llama.*\.py
+      - files~=^tests/.*llama.*\.py
+      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
+      - files~=^vllm/model_executor/models/.*llama.*\.py
+      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
+  actions:
+    label:
+      add:
+        - llama
+
 - name: label-multi-modality
   description: Automatically apply multi-modality label
   conditions:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b45619a3234cf490018ef46e5acfaeb0dda6da04..a105b0e14c4aff2330af3139cbcc050cb01b2bcc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,6 +11,8 @@ repos:
   hooks:
   - id: yapf
     args: [--in-place, --verbose]
+    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
+    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
   rev: v0.11.7
   hooks:
@@ -58,7 +60,7 @@ repos:
     entry: tools/mypy.sh 0 "local"
     language: python
     types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
     stages: [pre-commit] # Don't run in CI
   - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.9
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d222f84c9044fedf2c569663e625e5c1bdb9079..dbf0ca291dfb8acb28ea4d9542e9cc10dce191de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,6 +23,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")
 
+# Prevent installation of dependencies (cutlass) by default.
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
+
 #
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
@@ -179,9 +182,6 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 
-#
-# Set rocm version dev int.
-#
 if(VLLM_GPU_LANG STREQUAL "HIP")
   #
   # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
@@ -189,7 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
 
-
   #
   # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
   # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
@@ -244,6 +243,7 @@ set(VLLM_EXT_SRC
   "csrc/layernorm_kernels.cu"
   "csrc/opt/transpose_kernels.cu"
   # "csrc/layernorm_quant_kernels.cu"
+  "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
@@ -309,7 +309,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
 
     #
@@ -455,7 +455,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # kernels for the remaining archs that are not already built for 3x.
   # (Build 8.9 for FP8)
   cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
+    "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
@@ -544,8 +544,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # CUTLASS MoE kernels
 
   # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
-  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
-  # to compile MoE kernels that use its output.
+  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
+  # if it's possible to compile MoE kernels that use its output.
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
@@ -685,7 +685,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
 
     #
@@ -788,5 +788,7 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
     include(cmake/external_projects/flashmla.cmake)
+
+    # vllm-flash-attn should be last as it overwrites some CMake functions
     include(cmake/external_projects/vllm_flash_attn.cmake)
 endif ()
\ No newline at end of file
diff --git a/README.md b/README.md
index 67f6b957ec55a4744d5b1447345c3f2edcbea590..ec16d758327d4ecde377c9e96f9fdc49fb4da705 100644
--- a/README.md
+++ b/README.md
@@ -58,8 +58,8 @@ vLLM is fast with:
 - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
-- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
-- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
+- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
+- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
 - Speculative decoding
 - Chunked prefill
 
@@ -72,14 +72,14 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism and pipeline parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
 - Prefix caching support
 - Multi-LoRA support
 
 vLLM seamlessly supports most popular open-source models on HuggingFace, including:
 - Transformer-like LLMs (e.g., Llama)
 - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
-- Embedding Models (e.g. E5-Mistral)
+- Embedding Models (e.g., E5-Mistral)
 - Multi-modal LLMs (e.g., LLaVA)
 
 Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
@@ -162,4 +162,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 
 ## Media Kit
 
-- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
+- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
diff --git a/SECURITY.md b/SECURITY.md
index 47196a1f1221e216d90e23353f73371cf0b52773..6053cfb41f35b2a9511c2a64f68f7f0c15ee9c6f 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -8,4 +8,6 @@ Please report security issues privately using [the vulnerability submission form
 
 ---
 
+Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
+
 Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
diff --git a/benchmarks/README.md b/benchmarks/README.md
index ecab570bb31c4107dc1d022963abde2a71e15299..6f9fbb91cbd9110a36c7a6708c2a8d6cf3a50a35 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -64,6 +64,12 @@ become available.
       <td style="text-align: center;">✅</td>
       <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
     </tr>
+    <tr>
+      <td><strong>Custom</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td>Local file: <code>data.jsonl</code></td>
+    </tr>
   </tbody>
 </table>
 
@@ -124,6 +130,38 @@ P99 ITL (ms):                            8.39
 ==================================================
 ```
 
+### Custom Dataset
+If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
+
+```
+{"prompt": "What is the capital of India?"}
+{"prompt": "What is the capital of Iran?"}
+{"prompt": "What is the capital of China?"}
+``` 
+
+```bash
+# start server
+VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
+```
+
+```bash
+# run benchmarking script
+python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
+  --backend vllm \
+  --model meta-llama/Llama-3.1-8B-Instruct \
+  --endpoint /v1/completions \
+  --dataset-name custom \
+  --dataset-path <path-to-your-data-jsonl> \
+  --custom-skip-chat-template \
+  --num-prompts 80 \
+  --max-concurrency 1 \
+  --temperature=0.3 \
+  --top-p=0.75 \
+  --result-dir "./log/"
+```
+
+You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
+
 ### VisionArena Benchmark for Vision Language Models
 
 ```bash
@@ -146,9 +184,9 @@ python3 vllm/benchmarks/benchmark_serving.py \
 
 ``` bash
 VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-    --ngram_prompt_lookup_min 2 \
-    --ngram-prompt-lookup-max 5 \
-    --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
 ```
 
 ``` bash
@@ -203,6 +241,16 @@ python3 vllm/benchmarks/benchmark_serving.py \
     --seed 42
 ```
 
+**`philschmid/mt-bench`**
+
+``` bash
+python3 vllm/benchmarks/benchmark_serving.py \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 80
+```
+
 ### Running With Sampling Parameters
 
 When using OpenAI-compatible backends such as `vllm`, optional sampling
@@ -273,9 +321,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \
     --output-len=100 \
     --num-prompts=2048 \
     --async-engine \
-    --ngram_prompt_lookup_min=2 \
-    --ngram-prompt-lookup-max=5 \
-    --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
 ```
 
 ```
diff --git a/benchmarks/auto_tune.sh b/benchmarks/auto_tune.sh
index ea63c6f71a6c50ae698b0c9969d38da91896f728..1b01bbd61b628f0ed18041b8945d2e5f086951a4 100644
--- a/benchmarks/auto_tune.sh
+++ b/benchmarks/auto_tune.sh
@@ -10,11 +10,15 @@
 # 3. Set variables (ALL REQUIRED)
 #   BASE: your directory for vllm repo
 #   MODEL: the model served by vllm
+#   TP: ways of tensor parallelism
 #   DOWNLOAD_DIR: directory to download and load model weights.
 #   INPUT_LEN: request input len
 #   OUTPUT_LEN: request output len
 #   MIN_CACHE_HIT_PCT: prefix cache rate
 #   MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
+#   NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
+#   NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
+#   Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
 # 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
 # 5. The final result will be saved in RESULT file. 
 
@@ -30,31 +34,27 @@
 TAG=$(date +"%Y_%m_%d_%H_%M")
 BASE=""
 MODEL="meta-llama/Llama-3.1-8B-Instruct"
+TP=1
 DOWNLOAD_DIR=""
 INPUT_LEN=4000
 OUTPUT_LEN=16
-MIN_CACHE_HIT_PCT_PCT=0
+MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000
+NUM_SEQS_LIST="128 256"
+NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
 
 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
 
-echo "result file$ $RESULT"
+echo "result file: $RESULT"
 echo "model: $MODEL"
-echo
 
 rm -rf $LOG_FOLDER
 mkdir -p $LOG_FOLDER
 
 cd "$BASE/vllm"
-# create sonnet-4x.txt so that we can sample 2048 tokens for input
-echo "" > benchmarks/sonnet_4x.txt
-for _ in {1..4}
-do
-cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
-done
 
-pip install datasets
+pip install -q datasets
 
 current_hash=$(git rev-parse HEAD)
 echo "hash:$current_hash" >> "$RESULT"
@@ -64,53 +64,69 @@ best_throughput=0
 best_max_num_seqs=0
 best_num_batched_tokens=0
 best_goodput=0
-run_benchmark() {
-    local max_num_seqs=$1
-    local max_num_batched_tokens=$2
-    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
-    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
-    echo "vllm_log: $vllm_log"
-    echo
-    rm -f $vllm_log
 
-    # start the server
+start_server() {
+    local gpu_memory_utilization=$1
+    local max_num_seqs=$2
+    local max_num_batched_tokens=$3
+    local vllm_log=$4
+    
+    pkill -f vllm
+
     VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
         --disable-log-requests \
         --port 8004 \
-        --gpu-memory-utilization 0.98 \
+        --gpu-memory-utilization $gpu_memory_utilization \
         --max-num-seqs $max_num_seqs \
         --max-num-batched-tokens $max_num_batched_tokens \
-        --tensor-parallel-size 1 \
+        --tensor-parallel-size $TP \
         --enable-prefix-caching \
         --load-format dummy \
-        --download-dir $DOWNLOAD_DIR \
+        --download-dir "$DOWNLOAD_DIR" \
         --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
-    echo "wait for 10 minutes.."
-    echo
+
     # wait for 10 minutes...
     server_started=0
-    for i in {1..60}; do        
-        if grep -Fq "Application startup complete" "$vllm_log"; then
-            echo "Application started"
+    for i in {1..60}; do  
+        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
+        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) 
+        if [[ "$STATUS_CODE" -eq 200 ]]; then
             server_started=1
             break
         else
-            # echo "wait for 10 seconds..."
             sleep 10
         fi
     done
- 
     if (( ! server_started )); then
-        echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
-        echo "pkill -f vllm"
-        echo
-        pkill vllm
-        sleep 10
+        echo "server did not start within 10 minutes. Please check server log at $vllm_log".
         return 1
+    else
+        return 0
     fi
+}
+
+run_benchmark() {
+    local max_num_seqs=$1
+    local max_num_batched_tokens=$2
+    local gpu_memory_utilization=$3
+    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
+    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
+    echo "vllm_log: $vllm_log"
+    echo
+    rm -f $vllm_log
+    pkill -f vllm
+
+    echo "starting server..."
+    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
+    result=$?
+    if [[ "$result" -eq 1 ]]; then
+        echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
+    else
+        echo "server started."
+    fi
+    echo
     
     echo "run benchmark test..."
-    echo
     meet_latency_requirement=0
     # get a basic qps by using request-rate inf
     bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
@@ -118,29 +134,29 @@ run_benchmark() {
     python benchmarks/benchmark_serving.py \
         --backend vllm \
         --model $MODEL  \
-        --dataset-name sonnet \
-        --dataset-path benchmarks/sonnet_4x.txt \
-        --sonnet-input-len $INPUT_LEN \
-        --sonnet-output-len $OUTPUT_LEN \
+        --dataset-name random \
+        --random-input-len $INPUT_LEN \
+        --random-output-len $OUTPUT_LEN \
         --ignore-eos \
         --disable-tqdm \
         --request-rate inf \
         --percentile-metrics ttft,tpot,itl,e2el \
         --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
-        --num-prompts 100 \
-        --sonnet-prefix-len $prefix_len \
-        --port 8004 > "$bm_log"
-    through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+        --num-prompts 1000 \
+        --random-prefix-len $prefix_len \
+        --port 8004 &> "$bm_log"
+    throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
     e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
     goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
 
     if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
         meet_latency_requirement=1
+        request_rate=inf
     fi
 
     if (( ! meet_latency_requirement )); then
-    # start from request-rate as int(through_put) + 1
-        request_rate=$((${through_put%.*} + 1))
+    # start from request-rate as int(throughput) + 1
+        request_rate=$((${throughput%.*} + 1))
         while ((request_rate > 0)); do
             # clear prefix cache
             curl -X POST http://0.0.0.0:8004/reset_prefix_cache
@@ -149,19 +165,18 @@ run_benchmark() {
             python benchmarks/benchmark_serving.py \
                 --backend vllm \
                 --model $MODEL  \
-                --dataset-name sonnet \
-                --dataset-path benchmarks/sonnet_4x.txt \
-                --sonnet-input-len $INPUT_LEN \
-                --sonnet-output-len $OUTPUT_LEN \
-                --ignore_eos \
+                --dataset-name random \
+                --random-input-len $INPUT_LEN \
+                --random-output-len $OUTPUT_LEN \
+                --ignore-eos \
                 --disable-tqdm \
                 --request-rate $request_rate \
                 --percentile-metrics ttft,tpot,itl,e2el \
                 --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                 --num-prompts 100 \
-                --sonnet-prefix-len $prefix_len \
-                --port 8004 > "$bm_log"
-            through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+                --random-prefix-len $prefix_len \
+                --port 8004 &> "$bm_log"
+            throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
             e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
             goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
             if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
@@ -173,10 +188,10 @@ run_benchmark() {
     fi
     # write the results and update the best result.
     if ((meet_latency_requirement)); then
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
-        if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
-            best_throughput=$through_put
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
+        if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
+            best_throughput=$throughput
             best_max_num_seqs=$max_num_seqs
             best_num_batched_tokens=$max_num_batched_tokens
             best_goodput=$goodput
@@ -188,22 +203,39 @@ run_benchmark() {
 
     echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
 
-    echo "pkill -f vllm"
-    echo
     pkill vllm
     sleep 10
-    rm -f $vllm_log
     printf '=%.0s' $(seq 1 20)
     return 0
 }
 
+read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
+read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
+
+# first find out the max gpu-memory-utilization without HBM OOM.
+gpu_memory_utilization=0.98
+find_gpu_memory_utilization=0
+while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
+    start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
+    result=$?
+    if [[ "$result" -eq 0 ]]; then
+        find_gpu_memory_utilization=1
+        break
+    else
+        gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
+    fi
+done
+
+if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
+    echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
+else
+    echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
+    exit 1
+fi
 
-num_seqs_list="128 256"
-num_batched_tokens_list="512 1024 2048 4096"
-for num_seqs in $num_seqs_list; do
-    for num_batched_tokens in $num_batched_tokens_list; do
-        run_benchmark $num_seqs $num_batched_tokens
-        exit 0
+for num_seqs in "${num_seqs_list[@]}"; do
+    for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
+        run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
     done
 done
 echo "finish permutations"
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 88616e1108c52c51cf03f9ee211c347beb40767a..ddb38e304cd6565f5b5369c9555633fec7dfa373 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import io
 import json
@@ -324,7 +325,7 @@ async def async_request_openai_completions(
 
                                 most_recent_timestamp = timestamp
                                 generated_text += text or ""
-                            elif usage := data.get("usage"):
+                            if usage := data.get("usage"):
                                 output.output_tokens = usage.get("completion_tokens")
                     if first_chunk_received:
                         output.success = True
@@ -611,6 +612,7 @@ ASYNC_REQUEST_FUNCS = {
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,
+    "llama.cpp": async_request_openai_completions,
 }
 
 OPENAI_COMPATIBLE_BACKENDS = [
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 5513a5f78f1ce219be5462d08cb03d7a6e2bebde..5d2a26cd443c0060eb5ed80ed44565812caa96c6 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This module defines a framework for sampling benchmark requests from various
 datasets. Each dataset subclass of BenchmarkDataset must implement sample
@@ -9,9 +10,6 @@ generation. Supported dataset types include:
   - BurstGPT
   - HuggingFace
   - VisionArena
-
-TODO: Implement CustomDataset to parse a JSON file and convert its contents into
-SampleRequest instances, similar to the approach used in ShareGPT.
 """
 
 import base64
@@ -442,6 +440,97 @@ class ShareGPTDataset(BenchmarkDataset):
         return samples
 
 
+# -----------------------------------------------------------------------------
+# Custom Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class CustomDataset(BenchmarkDataset):
+    """
+    Implements the Custom dataset.  Loads data from a JSONL file and generates
+    sample requests based on conversation turns. E.g.,
+    ```
+    {"prompt": "What is the capital of India?"}
+    {"prompt": "What is the capital of Iran?"}
+    {"prompt": "What is the capital of China?"}
+    ```
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.load_data()
+
+    def load_data(self) -> None:
+        if self.dataset_path is None:
+            raise ValueError("dataset_path must be provided for loading data.")
+
+        # self.data will be a list of dictionaries
+        # e.g., [{"prompt": "What is the capital of India?"}, ...]
+        # This will be the standardized format which load_data()
+        # has to convert into depending on the filetype of dataset_path.
+        # sample() will assume this standardized format of self.data
+        self.data = []
+
+        # Load the JSONL file
+        if self.dataset_path.endswith(".jsonl"):
+            jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
+
+            # check if the JSONL file has a 'prompt' column
+            if "prompt" not in jsonl_data.columns:
+                raise ValueError("JSONL file must contain a 'prompt' column.")
+
+            # Convert each row to a dictionary and append to self.data
+            # This will convert the DataFrame to a list of dictionaries
+            # where each dictionary corresponds to a row in the DataFrame.
+            # This is the standardized format we want for self.data
+            for _, row in jsonl_data.iterrows():
+                self.data.append(row.to_dict())
+        else:
+            raise NotImplementedError(
+                "Only JSONL format is supported for CustomDataset."
+            )
+
+        random.seed(self.random_seed)
+        random.shuffle(self.data)
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        lora_path: Optional[str] = None,
+        max_loras: Optional[int] = None,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        skip_chat_template: bool = False,
+        **kwargs,
+    ) -> list:
+        sampled_requests = []
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            prompt = item["prompt"]
+
+            # apply template
+            if not skip_chat_template:
+                prompt = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": prompt}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+
+            prompt_len = len(tokenizer(prompt).input_ids)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                )
+            )
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+
+        return sampled_requests
+
+
 # -----------------------------------------------------------------------------
 # Sonnet Dataset Implementation
 # -----------------------------------------------------------------------------
@@ -776,7 +865,15 @@ class InstructCoderDataset(HuggingFaceDataset):
         for item in self.data:
             if len(sampled_requests) >= num_requests:
                 break
-            prompt = f"{item['instruction']}:\n{item['input']}"
+            prompt = f"{item['input']}\n\n{item['instruction']} Just output \
+            the code, do not include any explanation."
+
+            # apply template
+            prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
             prompt_len = len(tokenizer(prompt).input_ids)
             sampled_requests.append(
                 SampleRequest(
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 84759c5c354dc7754b09cf56286357af98dc6713..c06857247eeed9bbb61c437ff144d3bd9edba4b1 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark the latency of processing a single batch of requests."""
 
 import argparse
@@ -6,13 +7,12 @@ import dataclasses
 import json
 import os
 import time
-from pathlib import Path
 from typing import Any, Optional
 
 import numpy as np
-import torch
 from tqdm import tqdm
 
+import vllm.envs as envs
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
@@ -80,17 +80,9 @@ def main(args: argparse.Namespace):
 
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
-            with torch.profiler.profile(
-                activities=[
-                    torch.profiler.ProfilerActivity.CPU,
-                    torch.profiler.ProfilerActivity.CUDA,
-                ],
-                on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    str(profile_dir)
-                ),
-            ) as p:
-                llm_generate()
-            print(p.key_averages().table(sort_by="self_cuda_time_total"))
+            llm.start_profile()
+            llm_generate()
+            llm.stop_profile()
         else:
             start_time = time.perf_counter()
             llm_generate()
@@ -103,11 +95,7 @@ def main(args: argparse.Namespace):
         run_to_completion(profile_dir=None)
 
     if args.profile:
-        profile_dir = args.profile_result_dir
-        if not profile_dir:
-            profile_dir = (
-                Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
-            )
+        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
@@ -164,15 +152,6 @@ if __name__ == "__main__":
         action="store_true",
         help="profile the generation process of a single batch",
     )
-    parser.add_argument(
-        "--profile-result-dir",
-        type=str,
-        default=None,
-        help=(
-            "path to save the pytorch profiler output. Can be visualized "
-            "with ui.perfetto.dev or Tensorboard."
-        ),
-    )
     parser.add_argument(
         "--output-json",
         type=str,
@@ -193,4 +172,9 @@ if __name__ == "__main__":
     # numbers. We need to disable prefix caching by default.
     parser.set_defaults(enable_prefix_caching=False)
     args = parser.parse_args()
+    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
+        raise OSError(
+            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
+            "Please set it to a valid path to use torch profiler."
+        )
     main(args)
diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py
index 109624c877891c87d042abdbc5785e61b97a0924..00869fa94e71a78c8ec26f8d35d1ff67871584ee 100644
--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Offline benchmark to test the long document QA throughput.
 
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index ffaa8035797c10accebde3ab6e35356bd6db6292..3e4704f0b8205870914a0c1d7ffe100ece91e6ee 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark the efficiency of prefix caching.
 
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
index a05dd24dece83d5c7a12130f56d4a82d5eb377f4..5496703f23ccbe368f4b8dbc20da5ad308109996 100644
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline prioritization."""
 
 import argparse
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index a887e7150dc78ad1f5ca03951b5482d170a36f54..81428fb7dae12a9d4a1f6a0755f9bff25e28a585 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 r"""Benchmark online serving throughput.
 
 On the server side, run one of the following commands:
@@ -60,6 +61,7 @@ from benchmark_dataset import (
     ASRDataset,
     BurstGPTDataset,
     ConversationDataset,
+    CustomDataset,
     HuggingFaceDataset,
     InstructCoderDataset,
     MTBenchDataset,
@@ -627,7 +629,16 @@ def main(args: argparse.Namespace):
             "'--dataset-path' if required."
         )
 
-    if args.dataset_name == "sonnet":
+    if args.dataset_name == "custom":
+        dataset = CustomDataset(dataset_path=args.dataset_path)
+        input_requests = dataset.sample(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            output_len=args.custom_output_len,
+            skip_chat_template=args.custom_skip_chat_template,
+        )
+
+    elif args.dataset_name == "sonnet":
         dataset = SonnetDataset(dataset_path=args.dataset_path)
         # For the "sonnet" dataset, formatting depends on the backend.
         if args.backend == "openai-chat":
@@ -762,6 +773,10 @@ def main(args: argparse.Namespace):
     if "temperature" not in sampling_params:
         sampling_params["temperature"] = 0.0  # Default to greedy decoding.
 
+    if args.backend == "llama.cpp":
+        # Disable prompt caching in llama.cpp backend
+        sampling_params["cache_prompt"] = False
+
     # Avoid GC processing "static" data - reduce pause times.
     gc.collect()
     gc.freeze()
@@ -834,6 +849,8 @@ def main(args: argparse.Namespace):
             ]:
                 if field in result_json:
                     del result_json[field]
+                if field in benchmark_result:
+                    del benchmark_result[field]
 
         # Save to file
         base_model_id = model_id.split("/")[-1]
@@ -846,6 +863,7 @@ def main(args: argparse.Namespace):
         if args.result_filename:
             file_name = args.result_filename
         if args.result_dir:
+            os.makedirs(args.result_dir, exist_ok=True)
             file_name = os.path.join(args.result_dir, file_name)
         with open(
             file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
@@ -886,7 +904,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
+        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
@@ -1056,6 +1074,19 @@ if __name__ == "__main__":
     )
 
     # group for dataset specific arguments
+    custom_group = parser.add_argument_group("custom dataset options")
+    custom_group.add_argument(
+        "--custom-output-len",
+        type=int,
+        default=256,
+        help="Number of output tokens per request, used only for custom dataset.",
+    )
+    custom_group.add_argument(
+        "--custom-skip-chat-template",
+        action="store_true",
+        help="Skip applying chat template to prompt, used only for custom dataset.",
+    )
+
     sonnet_group = parser.add_argument_group("sonnet dataset options")
     sonnet_group.add_argument(
         "--sonnet-input-len",
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 6a50f47d3951cf7f9e33b69b152efe279aa9c6c7..c1501ad52c25af1f101785e4a2143e8a5e317489 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 r"""Benchmark online serving throughput with structured outputs.
 
 On the server side, run one of the following commands:
@@ -11,7 +12,6 @@ On the client side, run:
         --model <your_model> \
         --dataset json \
         --structured-output-ratio 1.0 \
-        --structured-output-backend auto \
         --request-rate 10 \
         --num-prompts 1000
 
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 7a13babda9d16227385e5bae3df37d5daed12b05..d19753d40e497d95c56ed1330a802d788b0d5ded 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline inference throughput."""
 
 import argparse
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
index b0c4fca92c3d0035904691d5a1fc9b99e741b7a0..283f938df50af101fa2ff82f3c46e123b25e5524 100644
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import json
@@ -65,4 +66,9 @@ class InfEncoder(json.JSONEncoder):
 
 def write_to_json(filename: str, records: list) -> None:
     with open(filename, "w") as f:
-        json.dump(records, f, cls=InfEncoder)
+        json.dump(
+            records,
+            f,
+            cls=InfEncoder,
+            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
+        )
diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
index da258f98e085f973110dc623111012f5f6b61b93..9ec270bbd2e988b459f99d876215984bf125d0f9 100644
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy
diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py
index 7e9f5a7fc0f464718e15e1cc024df89081da74f9..b4f3c6bf94eda0e1bf1d253def3b17d273415dcc 100644
--- a/benchmarks/cutlass_benchmarks/utils.py
+++ b/benchmarks/cutlass_benchmarks/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Cutlass bench utils
 from collections.abc import Iterable
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index 08e93837f7ddff3da18c37c482e527695dff75b2..cec422e8d597f1df353eb5a4836c8f88f1685c2a 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy
diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py
index d31b623a1ee604c2bf0b4ab7cb90d37ffa463adb..25b96ef56620ea7dbd97846cdd57ab0e97a6dfd1 100644
--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)
diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
index fce156e1c96c62bf0938facd623e40f6fc2a22a3..f62d8102e2d9f29c62f177d62453be5d8496cc1e 100644
--- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
+++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py
index fd19b40bf252c3076f317b5482e439de2c01ebbb..b1df2f255822dad046f5dfcdc1d6538006463510 100644
--- a/benchmarks/disagg_benchmarks/round_robin_proxy.py
+++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import itertools
diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
index 484d0cb3cba7d74db8dea04e68e17dde38be25e6..74fa56d076cf14bc066468be24f6053b45166001 100644
--- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
+++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
index 37a9173a1a937808271cc832ee49fe4172474601..901524214469e8677cbb58c3078e8b1d724585bb 100644
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pickle as pkl
 import time
diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py
new file mode 100644
index 0000000000000000000000000000000000000000..b964ed242edf8f16a8fd63ab27c098d11c9663f4
--- /dev/null
+++ b/benchmarks/kernels/bench_fp8_gemm.py
@@ -0,0 +1,223 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import copy
+import itertools
+
+import torch
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
+from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
+from vllm.triton_utils import triton
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
+        x_log=False,
+        line_arg="provider",
+        line_vals=[
+            "torch-bf16",
+            # "fp8-tensor-w-token-a",
+            "fp8-tensor-w-tensor-a",
+            "fp8-channel-w-token-a",
+            # "fp8-channel-w-tensor-a",
+            # "fp8-tensor-w-token-a-noquant",
+            "fp8-tensor-w-tensor-a-noquant",
+            "fp8-channel-w-token-a-noquant",
+            # "fp8-channel-w-tensor-a-noquant",
+        ],
+        line_names=[
+            "torch-bf16",
+            # "fp8-tensor-w-token-a",
+            "fp8-tensor-w-tensor-a",
+            "fp8-channel-w-token-a",
+            # "fp8-channel-w-tensor-a",
+            # "fp8-tensor-w-token-a-noquant",
+            "fp8-tensor-w-tensor-a-noquant",
+            "fp8-channel-w-token-a-noquant",
+            # "fp8-channel-w-tensor-a-noquant",
+        ],
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs FP8 GEMMs",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K):
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    # Create input tensors
+    a = torch.randn((M, K), device=device, dtype=dtype)
+    b = torch.randn((N, K), device=device, dtype=dtype)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if "torch-bf16" in provider:
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
+        )
+
+    elif "fp8" in provider:
+        # Weights are always quantized ahead of time
+        if "noquant" in provider:
+            # For no quantization, we just measure the GEMM
+            if "tensor-w-token-a" in provider:
+                # Dynamic per-token quant for A, per-tensor quant for B
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
+                assert scale_b_fp8.numel() == 1
+                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
+                    a, use_per_token_if_dynamic=True
+                )
+
+                def run_quant():
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+            elif "tensor-w-tensor-a" in provider:
+                # Static per-tensor quantization with fixed scales
+                # for both A and B
+                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
+                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+                assert scale_b_fp8.numel() == 1
+                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
+
+                def run_quant():
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+            elif "channel-w-token-a" in provider:
+                # Static per-channel quantization for weights, per-token
+                # quant for A
+                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
+                assert scale_b_fp8.numel() == N
+                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
+                    a, use_per_token_if_dynamic=True
+                )
+
+                def run_quant():
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+            elif "channel-w-tensor-a" in provider:
+                # Static per-channel quantization for weights, per-tensor
+                # quant for A
+                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
+                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
+                assert scale_b_fp8.numel() == N
+                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
+
+                def run_quant():
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+        else:
+            # In these cases, we quantize the activations during the GEMM call
+            if "tensor-w-token-a" in provider:
+                # Dynamic per-token quant for A, per-tensor quant for B
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
+                assert scale_b_fp8.numel() == 1
+
+                def run_quant():
+                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
+                        a, use_per_token_if_dynamic=True
+                    )
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+            elif "tensor-w-tensor-a" in provider:
+                # Static per-tensor quantization with fixed scales
+                # for both A and B
+                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
+                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+                assert scale_b_fp8.numel() == 1
+
+                def run_quant():
+                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+            elif "channel-w-token-a" in provider:
+                # Static per-channel quantization for weights, per-token
+                # quant for A
+                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
+                assert scale_b_fp8.numel() == N
+
+                def run_quant():
+                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
+                        a, use_per_token_if_dynamic=True
+                    )
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+            elif "channel-w-tensor-a" in provider:
+                # Static per-channel quantization for weights, per-tensor
+                # quant for A
+                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
+                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
+                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
+                assert scale_b_fp8.numel() == N
+
+                def run_quant():
+                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
+                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+        b_fp8 = b_fp8.t()
+
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: run_quant(), quantiles=quantiles
+        )
+
+    # Calculate TFLOP/s, two flops per multiply-add
+    tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
+    return tflops(ms), tflops(max_ms), tflops(min_ms)
+
+
+def prepare_shapes(args):
+    KN_model_names = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        assert model in WEIGHT_SHAPES
+        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            KN.append(model)
+            KN_model_names.append(KN)
+    return KN_model_names
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.1-8B-Instruct"],
+        choices=[*WEIGHT_SHAPES.keys()],
+        help="List of models to benchmark",
+    )
+    parser.add_argument(
+        "--tp-sizes",
+        nargs="+",
+        type=int,
+        default=[1],
+        help="List of tensor parallel sizes",
+    )
+    args = parser.parse_args()
+
+    KN_model_names = prepare_shapes(args)
+    for K, N, model_name in KN_model_names:
+        print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
+        benchmark.run(
+            print_data=True,
+            show_plots=True,
+            save_path=f"bench_fp8_res_n{N}_k{K}",
+            N=N,
+            K=K,
+        )
+
+    print("Benchmark finished!")
diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py
index e9934aa479dd6bb21d0e6e45d5e15999c0d23dda..42de062b08e424619b010f9170725b6785a35f58 100644
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import sys
diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py
index d40ab70ec539b27af09739167a7c11900e6e936d..97ee060341373022b06c6ce62600d5c23c771c37 100644
--- a/benchmarks/kernels/benchmark_bitblas.py
+++ b/benchmarks/kernels/benchmark_bitblas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
index d39d8a6e3aba31612817e62f50ed8bb911fb66e5..35c20ee41b9a94c10f1367a45fe391cc15e9ffec 100644
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
 kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
@@ -90,7 +91,7 @@ def bench_run(
 
     score = torch.randn((m, num_experts), device=device, dtype=dtype)
 
-    topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
+    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
 
     quant_blocksize = 16
     w1_blockscale = torch.empty(
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index 2197bceabe6c034eb591fae5308e7d483016c317..acabe6c1ddb0a18aac64cef4326fb14220fe4858 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.utils.benchmark as benchmark
@@ -6,8 +7,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
 
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
-    cutlass_moe_fp8,
     fused_experts,
     fused_topk,
 )
@@ -69,18 +70,9 @@ def bench_run(
     w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
     w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
 
-    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
-    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
-    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
-    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
-
     for expert in range(num_experts):
         w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
         w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
-    w1_q_notransp = w1_q.clone()
-    w2_q_notransp = w2_q.clone()
-    w1_q = w1_q.transpose(1, 2)
-    w2_q = w2_q.transpose(1, 2)
 
     score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
 
@@ -121,10 +113,6 @@ def bench_run(
         w2_scale: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        c_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides2: torch.Tensor,
         num_repeats: int,
     ):
         for _ in range(num_repeats):
@@ -132,14 +120,10 @@ def bench_run(
                 a,
                 w1,
                 w2,
-                w1_scale,
-                w2_scale,
                 topk_weights,
                 topk_ids,
-                ab_strides1,
-                c_strides1,
-                ab_strides2,
-                c_strides2,
+                w1_scale,
+                w2_scale,
                 a1_scale=a_scale,
             )
 
@@ -152,10 +136,6 @@ def bench_run(
         w2_scale: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        c_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides2: torch.Tensor,
     ):
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
@@ -164,14 +144,10 @@ def bench_run(
                 a,
                 w1_q,
                 w2_q,
-                w1_scale,
-                w2_scale,
                 topk_weights,
                 topk_ids,
-                ab_strides1,
-                c_strides1,
-                ab_strides2,
-                c_strides2,
+                w1_scale,
+                w2_scale,
                 a1_scale=a_scale,
             )
 
@@ -217,10 +193,6 @@ def bench_run(
             w2_scale,
             topk_weights,
             topk_ids,
-            ab_strides1,
-            c_strides1,
-            ab_strides2,
-            c_strides2,
         )
     torch.cuda.synchronize()
 
@@ -229,8 +201,8 @@ def bench_run(
     with torch.cuda.graph(triton_graph, stream=triton_stream):
         run_triton_from_graph(
             a,
-            w1_q_notransp,
-            w2_q_notransp,
+            w1_q,
+            w2_q,
             topk_weights,
             topk_ids,
             w1_scale,
@@ -249,18 +221,12 @@ def bench_run(
         "w2": w2,
         "score": score,
         "topk": topk,
-        "w1_q_notransp": w1_q_notransp,
-        "w2_q_notransp": w2_q_notransp,
         # Cutlass params
         "a_scale": a_scale,
         "w1_q": w1_q,
         "w2_q": w2_q,
         "w1_scale": w1_scale,
         "w2_scale": w2_scale,
-        "ab_strides1": ab_strides1,
-        "c_strides1": c_strides1,
-        "ab_strides2": ab_strides2,
-        "c_strides2": c_strides2,
         # cuda graph params
         "cutlass_graph": cutlass_graph,
         "triton_graph": triton_graph,
@@ -278,8 +244,8 @@ def bench_run(
     # Warmup
     run_triton_moe(
         a,
-        w1_q_notransp,
-        w2_q_notransp,
+        w1_q,
+        w2_q,
         topk_weights,
         topk_ids,
         w1_scale,
@@ -290,7 +256,7 @@ def bench_run(
 
     results.append(
         benchmark.Timer(
-            stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)",  # noqa: E501
+            stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
@@ -321,16 +287,12 @@ def bench_run(
         w2_scale,
         topk_weights,
         topk_ids,
-        ab_strides1,
-        c_strides1,
-        ab_strides2,
-        c_strides2,
         num_warmup,
     )
 
     results.append(
         benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)",  # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index f21ca97eeb8a9b2abbf04f979f90ece9003f1cef..69978ec6b23e94ce21d8f3dfb633317b3803e6ef 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 6c1284930c1ec3f6963dd9ec325e72179d14da5f..3d38d4b3534e8993abc8aff85712230e7262a740 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index f8f1db04790bfb714751ef29da3391440a01e378..0f896f187ecb9783b18036779d9ebf56a1783a30 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index b17baff2e5f5d36042e4737454b75c4dd5868cd6..9ea1fddae2a3b0b7ff595a70c0c61f9355f0ff41 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.utils.benchmark as benchmark
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index c2f7660858f574791d1477d69f936b354eba6f20..cef53b183cef3bd9afadc2f45bbf9624c58e6ba0 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import json
@@ -6,7 +7,6 @@ import time
 from contextlib import nullcontext
 from datetime import datetime
 from itertools import product
-from types import SimpleNamespace
 from typing import Any, TypedDict
 
 import ray
@@ -42,7 +42,7 @@ def benchmark_config(
     use_fp8_w8a8: bool,
     use_int8_w8a16: bool,
     num_iters: int = 100,
-    block_quant_shape: List[int] = None,
+    block_quant_shape: list[int] = None,
     use_deep_gemm: bool = False,
 ) -> float:
     init_dtype = torch.float16 if use_fp8_w8a8 else dtype
@@ -399,7 +399,7 @@ class BenchmarkWorker:
         dtype: torch.dtype,
         use_fp8_w8a8: bool,
         use_int8_w8a16: bool,
-        block_quant_shape: List[int] = None,
+        block_quant_shape: list[int] = None,
         use_deep_gemm: bool = False,
     ) -> tuple[dict[str, int], float]:
         current_platform.seed_everything(self.seed)
@@ -531,7 +531,7 @@ def save_configs(
     dtype: torch.dtype,
     use_fp8_w8a8: bool,
     use_int8_w8a16: bool,
-    block_quant_shape: List[int],
+    block_quant_shape: list[int],
 ) -> None:
     dtype_str = get_config_dtype_str(
         dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@@ -562,7 +562,6 @@ def main(args: argparse.Namespace):
     config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
     if args.model_prefix:
         config = getattr(config, args.model_prefix)
-    config = SimpleNamespace(**config)
 
     if config.architectures[0] == "DbrxForCausalLM":
         E = config.ffn_config.moe_num_experts
@@ -594,11 +593,7 @@ def main(args: argparse.Namespace):
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
 
     hidden_size = config.hidden_size
-    dtype = (
-        torch.float16
-        if current_platform.is_rocm()
-        else getattr(torch, config.torch_dtype)
-    )
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index 333986fdf5eff52574194bbea7c51a8000c37424..dba1f3943b96c370acf1262273695dbdb4097711 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 from typing import Any, TypedDict
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 54f05e72322654ff432dfae2c5d8bb52e4ce6ef1..7e0376c18ecc79690ca49e3995258c5e749bbac7 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 import time
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 2463dfebe83cce8511a68775e4a4666a6607f892..6ab26f5f1adf73ccdc68c0d5f20d622d0c09b7ba 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 
diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py
index d720083b615037d9f0236086cf42d392a3494b00..4cf633a81358d80202fa383d7312dac4a818ea62 100644
--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from typing import Optional, Union
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 110d36db157fdf70afa3a2dde76e8c31cfab6922..b81baf17a8c674edd72fe9a6397a0b027a8d33b9 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import accumulate
 from typing import Optional
@@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
     seed: int,
     device: str,
     max_position: int = 8192,
-    base: int = 10000,
+    base: float = 10000,
 ) -> None:
     current_platform.seed_everything(seed)
     torch.set_default_device(device)
diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py
index 70190ba24d9dffbbe05a4e2107a89d020de557b0..18c459c31d3f84d1ca7b7e31d00d50b521177e96 100644
--- a/benchmarks/kernels/benchmark_shapes.py
+++ b/benchmarks/kernels/benchmark_shapes.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 WEIGHT_SHAPES = {
     "ideal": [[4 * 256 * 32, 256 * 32]],
diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index 6315c1ee6cdd6893e9d5e513c7dadc79f228f7d3..4fcdbadd65ecd366f51ce5d9053cc700854dcb71 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from sglang quantization/tuning_block_wise_kernel.py
 
 import argparse
diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
index e377648254512dab59b6b97b678a52872cdc2b22..e67ce054531818d6d0b59a21c0bd192c4290763a 100644
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # fmt: off
 # ruff: noqa: E501
 import time
diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py
index 0c86e40729579994ed54432dbfbbedf15b3ba2d6..9a4da0ef5a85d0df2178d5e990ad177e3f76bf37 100644
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 import pickle
diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py
index 877a29feed9dfe226a28baaac2d4d5a72ef23943..4bbb36bb4359259944310adb8e1047a02f126dea 100644
--- a/benchmarks/kernels/utils.py
+++ b/benchmarks/kernels/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from collections.abc import Iterable
diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py
index 89b05d5882a381ce789230a64672811382ebdb8b..a27f02394afbdfa4f3a352a599db349a2601bea5 100644
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Weight Shapes are in the format
 # ([K, N], TP_SPLIT_DIM)
@@ -48,4 +49,50 @@ WEIGHT_SHAPES = {
         ([16384, 106496], 1),
         ([53248, 16384], 0),
     ],
+    "meta-llama/Llama-3.1-8B-Instruct": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
+    "meta-llama/Llama-3.3-70B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 57344], 1),
+        ([28672, 8192], 0),
+    ],
+    "mistralai/Mistral-Large-Instruct-2407": [
+        ([12288, 14336], 1),
+        ([12288, 12288], 0),
+        ([12288, 57344], 1),
+        ([28672, 12288], 0),
+    ],
+    "Qwen/Qwen2.5-7B-Instruct": [
+        ([3584, 4608], 1),
+        ([3584, 3584], 0),
+        ([3584, 37888], 1),
+        ([18944, 3584], 0),
+    ],
+    "Qwen/Qwen2.5-32B-Instruct": [
+        ([5120, 7168], 1),
+        ([5120, 5120], 0),
+        ([5120, 55296], 1),
+        ([27648, 5120], 0),
+    ],
+    "Qwen/Qwen2.5-72B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 59136], 1),
+        ([29568, 8192], 0),
+    ],
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
+        ([2048, 3072], 1),
+        ([2048, 4096], 1),
+        ([2048, 2048], 0),
+        ([2048, 576], 0),
+        ([2048, 21888], 1),
+        ([10944, 2048], 0),
+        ([2048, 2816], 1),
+        ([1408, 2048], 0),
+    ],
 }
diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py
index d5701a8fbd6d85f75c4830ca379590306fe59eae..0957a9c65f06c912ae73588e017602af72ae9772 100644
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import cProfile
 import pstats
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index fb763db9fc359ef9dbe96b688fe914e36245011d..5cd2c98f234381818063d7947a05ca28cb544129 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -75,6 +75,7 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
 else()
     find_isa(${CPUINFO} "avx2" AVX2_FOUND)
     find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
+    find_isa(${CPUINFO} "Power11" POWER11_FOUND)
     find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
     find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
     find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
@@ -106,13 +107,19 @@ elseif (AVX2_FOUND)
     list(APPEND CXX_COMPILE_FLAGS "-mavx2")
     message(WARNING "vLLM CPU backend using AVX2 ISA")
     
-elseif (POWER9_FOUND OR POWER10_FOUND)
+elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
     message(STATUS "PowerPC detected")
-    # Check for PowerPC VSX support
-    list(APPEND CXX_COMPILE_FLAGS
-        "-mvsx"
-        "-mcpu=native"
-        "-mtune=native")
+    if (POWER9_FOUND)
+        list(APPEND CXX_COMPILE_FLAGS
+            "-mvsx"
+            "-mcpu=power9"
+            "-mtune=power9")
+    elseif (POWER10_FOUND OR POWER11_FOUND)
+        list(APPEND CXX_COMPILE_FLAGS
+            "-mvsx"
+            "-mcpu=power10"
+            "-mtune=power10")
+    endif()
 
 elseif (ASIMD_FOUND)
     message(STATUS "ARMv8 or later architecture detected")
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index b04e4c2d06edc90b443d91e34491a0f67431bf67..a4edd5b96fe29ccfece190016aad0cb6c7d283d6 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -46,22 +46,38 @@ else()
 endif()
 
 
+# Ensure the vllm/vllm_flash_attn directory exists before installation
+install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS)
+
+# Make sure vllm-flash-attn install rules are nested under vllm/
+# This is here to support installing all components under the same prefix with cmake --install.
+# setup.py installs every component separately but uses the same prefix for all.
+# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
+# and these statements don't hurt when installing neither component.
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
+install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
+
 # Fetch the vllm-flash-attn library
 FetchContent_MakeAvailable(vllm-flash-attn)
 message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
 
+# Restore the install prefix
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
+
 # Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
 # case only one is built, in the case both are built redundant work is done)
 install(
   DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm_flash_attn
+  DESTINATION vllm/vllm_flash_attn
   COMPONENT _vllm_fa2_C
   FILES_MATCHING PATTERN "*.py"
 )
 
 install(
   DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm_flash_attn
+  DESTINATION vllm/vllm_flash_attn
   COMPONENT _vllm_fa3_C
   FILES_MATCHING PATTERN "*.py"
 )
diff --git a/cmake/hipify.py b/cmake/hipify.py
index a15577125eb1fef3584fe3ce11594dd353dd2071..55d378f5b11137739d21beb6d80df672040f66a4 100755
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 #
 # A command line tool for running pytorch's hipify preprocessor on CUDA
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 375d254ba343ff9c0e968cb8c7984e1a56e0dd6a..8002dd74477b6f2d0a4c3e3d64bb7edf8128c635 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
   set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
   add_custom_target(
     hipify${NAME}
-    COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
+    COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
     DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
     BYPRODUCTS ${HIP_SRCS}
     COMMENT "Running hipify on ${NAME} extension source files.")
diff --git a/csrc/attention/mla/cutlass_mla_kernels.cu b/csrc/attention/mla/cutlass_mla_kernels.cu
index 6743af0cf2dbab816b4c204320ebf484ea8516fb..f4b6b19f4b232c8bfd9e66ecf693a0dbfa3a1068 100644
--- a/csrc/attention/mla/cutlass_mla_kernels.cu
+++ b/csrc/attention/mla/cutlass_mla_kernels.cu
@@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
       {static_cast<ElementOut*>(out.data_ptr()), stride_O,
        static_cast<ElementAcc*>(nullptr), stride_LSE},
       hw_info,
-      -1,       // split_kv
+      1,        // split_kv
       nullptr,  // is_var_split_kv
   };
   // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
index d64f0d0a5c2a046bcf3210aab83bf1d6f9380232..1dd7101acc27ded603cd1999104e599571345308 100644
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 from typing import Union
diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py
index 15f008d4f61ed66ad7e0df5643796ed6176b2af5..49f33718a21e815f8219606bc11952f1ab0fed02 100644
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import glob
 import itertools
 import os
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index 8fda434d452f9bfa085f36d0d22eb62f0af24185..c4faef731060a6d60fdfacb4d5f307bcf8d4e452 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -30,4 +30,8 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                              int64_t BLOCK_SIZE_K, int64_t bit);
 #endif
 
-bool moe_permute_unpermute_supported();
\ No newline at end of file
+bool moe_permute_unpermute_supported();
+
+void shuffle_rows(const torch::Tensor& input_tensor,
+                  const torch::Tensor& dst2src_map,
+                  torch::Tensor& output_tensor);
\ No newline at end of file
diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu
index 9a7465261abfeb4ab1f36bf8bce86e570193c034..68f429fac18ab892bd8a7ebc7369841512f96773 100644
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -130,6 +130,62 @@ void moe_unpermute(
   });
 }
 
+template <typename T>
+__global__ void shuffleInputRowsKernel(const T* input,
+                                       const int32_t* dst2src_map, T* output,
+                                       int64_t num_src_rows,
+                                       int64_t num_dst_rows, int64_t num_cols) {
+  int64_t dest_row_idx = blockIdx.x;
+  int64_t const source_row_idx = dst2src_map[dest_row_idx];
+
+  if (blockIdx.x < num_dst_rows) {
+    // Load 128-bits per thread
+    constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
+    using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
+
+    // Duplicate and permute rows
+    auto const* source_row_ptr =
+        reinterpret_cast<DataElem const*>(input + source_row_idx * num_cols);
+    auto* dest_row_ptr =
+        reinterpret_cast<DataElem*>(output + dest_row_idx * num_cols);
+
+    int64_t const start_offset = threadIdx.x;
+    int64_t const stride = blockDim.x;
+    int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD;
+
+    for (int elem_index = start_offset; elem_index < num_elems_in_col;
+         elem_index += stride) {
+      dest_row_ptr[elem_index] = source_row_ptr[elem_index];
+    }
+  }
+}
+
+void shuffle_rows(const torch::Tensor& input_tensor,
+                  const torch::Tensor& dst2src_map,
+                  torch::Tensor& output_tensor) {
+  TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(),
+              "Input and output tensors must have the same data type");
+
+  auto stream = at::cuda::getCurrentCUDAStream().stream();
+  int64_t const blocks = output_tensor.size(0);
+  int64_t const threads = 256;
+  int64_t const num_dest_rows = output_tensor.size(0);
+  int64_t const num_src_rows = input_tensor.size(0);
+  int64_t const num_cols = input_tensor.size(1);
+
+  TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
+              "num_cols must be divisible by 128 / "
+              "sizeof(input_tensor.scalar_type()) / 8");
+
+  MOE_DISPATCH(input_tensor.scalar_type(), [&] {
+    shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
+        reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
+        dst2src_map.data_ptr<int32_t>(),
+        reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
+        num_dest_rows, num_cols);
+  });
+}
+
 #else
 
 void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
diff --git a/csrc/moe/permute_unpermute_kernels/dispatch.h b/csrc/moe/permute_unpermute_kernels/dispatch.h
index 41932cdd85bcd35b1623943695d05c6935cc6038..d0f1ea4aded3388353baaf9bb5ef49b893363002 100644
--- a/csrc/moe/permute_unpermute_kernels/dispatch.h
+++ b/csrc/moe/permute_unpermute_kernels/dispatch.h
@@ -14,12 +14,13 @@
     __VA_ARGS__();                                         \
     break;                                                 \
   }
-#define MOE_DISPATCH_FLOAT_CASE(...)                          \
-  MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)       \
-  MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)        \
-  MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)    \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
+#define MOE_DISPATCH_FLOAT_CASE(...)                            \
+  MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)         \
+  MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)          \
+  MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)      \
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)   \
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
+  MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
 
 #define MOE_DISPATCH(TYPE, ...) \
   MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
@@ -39,6 +40,11 @@ template <>
 struct ScalarType2CudaType<at::ScalarType::BFloat16> {
   using type = __nv_bfloat16;
 };
+// uint8 for packed fp4
+template <>
+struct ScalarType2CudaType<at::ScalarType::Byte> {
+  using type = uint8_t;
+};
 
 // #if __CUDA_ARCH__ >= 890
 // fp8
diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu
index a9379032245d9b74838ff30398cf796a6568ca72..10be47966f61189e995d45a8010a126edbfcc34c 100644
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -516,9 +516,8 @@ void topk_softmax(
             topk,
             stream);
     }
-    else
+    else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
     {
-        assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
         vllm::moe::topkGatingSoftmaxKernelLauncher(
             gating_output.data_ptr<float>(),
             topk_weights.data_ptr<float>(),
@@ -530,4 +529,17 @@ void topk_softmax(
             topk,
             stream);
     }
+    else {
+        assert(topk_indices.scalar_type() == at::ScalarType::Int64);
+        vllm::moe::topkGatingSoftmaxKernelLauncher(
+            gating_output.data_ptr<float>(),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<int64_t>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens,
+            num_experts,
+            topk,
+            stream);
+    }
 }
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 7d35ec79ead487ad0689fff3a4e19a0d25043635..a74eb3720cf1cf48433813da0a21ab0d79c87521 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -81,6 +81,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
   m.def("moe_permute_unpermute_supported() -> bool");
   m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
 
+  // Row shuffle for MoE
+  m.def(
+      "shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! "
+      "output_tensor) -> ()");
+  m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
+
 #endif
 }
 
diff --git a/csrc/ops.h b/csrc/ops.h
index fe38e83ffbeb0b207813402a0858de0a713155da..6b3d50ae8bfd85ae22286814a8e6a8b01da2a297 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                         torch::Tensor& weight, double epsilon);
 
+void apply_repetition_penalties_(torch::Tensor& logits,
+                                 const torch::Tensor& prompt_mask,
+                                 const torch::Tensor& output_mask,
+                                 const torch::Tensor& repetition_penalties);
+
 // void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
 //                                torch::Tensor& weight, torch::Tensor& scale,
 //                                double epsilon);
@@ -233,7 +238,8 @@ void cutlass_moe_mm(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides);
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch);
 
 void cutlass_fp4_group_mm(
     torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
@@ -245,7 +251,16 @@ void get_cutlass_moe_mm_data(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k);
+    const int64_t num_experts, const int64_t n, const int64_t k,
+    const std::optional<torch::Tensor>& blockscale_offsets);
+
+void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
+                                  torch::Tensor& problem_sizes1,
+                                  torch::Tensor& problem_sizes2,
+                                  const torch::Tensor& expert_num_tokens,
+                                  const int64_t num_local_experts,
+                                  const int64_t padded_m, const int64_t n,
+                                  const int64_t k);
 
 void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                            torch::Tensor const& b,
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
index 84492553c02f2177e3fa81da1033fcb612e6f98c..4a8a5ed02d6ce454b33e2efbd353cfccfb98937e 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
@@ -9,10 +9,6 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
                                            torch::Tensor const& b,
                                            torch::Tensor const& a_scales,
                                            torch::Tensor const& b_scales) {
-  TORCH_CHECK(
-      a.size(0) % 4 == 0,
-      "Input tensor must have a number of rows that is a multiple of 4. ",
-      "but got: ", a.size(0), " rows.");
   if (out.dtype() == torch::kBFloat16) {
     cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
         out, a, b, a_scales, b_scales);
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
index ef324364c6d5e01cc3f32222f07cfe0fdd20f589..c841125dbb734f31d97f64d2d94c73f841a7bae1 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "cuda_utils.h"
 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
 
@@ -22,49 +23,49 @@ namespace vllm {
 
 using namespace cute;
 
-template <typename OutType, typename MmaTileShape, typename ScalesPerTile,
-          class ClusterShape, typename EpilogueScheduler,
-          typename MainloopScheduler>
+// clang-format off
+template <class OutType, int ScaleGranularityM,
+          int ScaleGranularityN, int ScaleGranularityK,
+          class MmaTileShape, class ClusterShape,
+          class EpilogueScheduler, class MainloopScheduler,
+          bool swap_ab_ = false>
 struct cutlass_3x_gemm_fp8_blockwise {
+  static constexpr bool swap_ab = swap_ab_;
   using ElementAB = cutlass::float_e4m3_t;
 
   using ElementA = ElementAB;
   using LayoutA = cutlass::layout::RowMajor;
+  using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
   static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
 
   using ElementB = ElementAB;
   using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
   static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
 
-  using ElementC = void;
   using ElementD = OutType;
   using LayoutD = cutlass::layout::RowMajor;
+  using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
   static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
 
+  using ElementC = void; // TODO: support bias
   using LayoutC = LayoutD;
+  using LayoutC_Transpose = LayoutD_Transpose;
   static constexpr int AlignmentC = AlignmentD;
 
   using ElementAccumulator = float;
   using ElementCompute = float;
   using ElementBlockScale = float;
 
-  // MMA and Cluster Tile Shapes
-  // Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster
-  // Shape %2 == 0 using MmaTileShape_MNK = Shape<_128,_128,_128>;
-  static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{});
-  static constexpr int ScaleGranularityM =
-      size<0>(MmaTileShape{}) / ScaleMsPerTile;
-  static constexpr int ScaleGranularityN =
-      size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{});
-  static constexpr int ScaleGranularityK =
-      size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{});
-
-  // Shape of the threadblocks in a cluster
-  using ClusterShape_MNK = ClusterShape;
-
-  using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
-      ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
-      cute::UMMA::Major::MN, cute::UMMA::Major::K>;
+  using ScaleConfig = conditional_t<swap_ab,
+      cutlass::detail::Sm100BlockwiseScaleConfig<
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+        cute::UMMA::Major::K, cute::UMMA::Major::MN>,
+      cutlass::detail::Sm100BlockwiseScaleConfig<
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+        cute::UMMA::Major::MN, cute::UMMA::Major::K>>;
+
+  // layout_SFA and layout_SFB cannot be swapped since they are deduced.
   using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
   using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
 
@@ -73,7 +74,6 @@ struct cutlass_3x_gemm_fp8_blockwise {
 
   static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
   using ElementScalar = float;
-  // clang-format off
   using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
   using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
       ArchTag,
@@ -84,33 +84,47 @@ struct cutlass_3x_gemm_fp8_blockwise {
       ElementAccumulator,
       ElementCompute,
       ElementC,
-      LayoutC,
+      conditional_t<swap_ab, LayoutC_Transpose, LayoutC>,
       AlignmentC,
       ElementD,
-      LayoutD,
+      conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
       AlignmentD,
       EpilogueScheduler,
       DefaultOperation
   >::CollectiveOp;
  
   using StageCountType = cutlass::gemm::collective::StageCountAuto; 
-  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
-      ArchTag,
-      OperatorClass,
-      ElementA,
-      cute::tuple<LayoutA, LayoutSFA>,
-      AlignmentA,
-      ElementB,
-      cute::tuple<LayoutB, LayoutSFB>,
-      AlignmentB,
-      ElementAccumulator,
-      MmaTileShape,
-      ClusterShape,
-
+  using CollectiveMainloop = conditional_t<swap_ab,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag,
+          OperatorClass,
+          ElementB,
+          cute::tuple<LayoutB_Transpose, LayoutSFA>,
+          AlignmentB,
+          ElementA,
+          cute::tuple<LayoutA_Transpose, LayoutSFB>,
+          AlignmentA,
+          ElementAccumulator,
+          MmaTileShape,
+          ClusterShape,
           cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
-      MainloopScheduler
-  >::CollectiveOp;
-  // clang-format on
+          MainloopScheduler
+      >::CollectiveOp,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag,
+          OperatorClass,
+          ElementA,
+          cute::tuple<LayoutA, LayoutSFA>,
+          AlignmentA,
+          ElementB,
+          cute::tuple<LayoutB, LayoutSFB>,
+          AlignmentB,
+          ElementAccumulator,
+          MmaTileShape,
+          ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          MainloopScheduler
+      >::CollectiveOp>;
 
   using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
       Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
@@ -123,6 +137,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
                                    torch::Tensor const& b,
                                    torch::Tensor const& a_scales,
                                    torch::Tensor const& b_scales) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
   using GemmKernel = typename Gemm::GemmKernel;
   using StrideA = typename Gemm::GemmKernel::StrideA;
   using StrideB = typename Gemm::GemmKernel::StrideB;
@@ -136,7 +151,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   using ElementD = typename Gemm::ElementD;
 
   int32_t m = a.size(0), n = b.size(1), k = a.size(1);
-  auto prob_shape = cute::make_shape(m, n, k, 1);
 
   StrideA a_stride;
   StrideB b_stride;
@@ -146,11 +160,13 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   b_stride =
       cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
   c_stride =
-      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
+      cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
 
-  LayoutSFA layout_SFA =
+  LayoutSFA layout_SFA = swap_ab ? 
+      ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) :
       ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
-  LayoutSFB layout_SFB =
+  LayoutSFB layout_SFB = swap_ab ?
+      ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
       ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
 
   auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
@@ -158,9 +174,22 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
   auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
 
-  typename GemmKernel::MainloopArguments mainloop_args{
-      a_ptr,        a_stride,   b_ptr,        b_stride,
-      a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB};
+  auto mainloop_args = [&](){
+    // layout_SFA and layout_SFB cannot be swapped since they are deduced.
+    if (swap_ab) {
+      return typename GemmKernel::MainloopArguments{
+          b_ptr,        b_stride,   a_ptr,        a_stride,
+          b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
+      };
+    }
+    else {
+      return typename GemmKernel::MainloopArguments{
+          a_ptr,        a_stride,   b_ptr,        b_stride,
+          a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
+      };
+    }
+  }();
+  auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
 
   auto c_ptr = static_cast<ElementD*>(out.data_ptr());
   typename GemmKernel::EpilogueArguments epilogue_args{
@@ -175,29 +204,74 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
                                                torch::Tensor const& b,
                                                torch::Tensor const& a_scales,
                                                torch::Tensor const& b_scales) {
-  auto m = a.size(0);
-  auto k = a.size(1);
-  auto n = b.size(1);
-  int sms;
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms;
   cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
 
-  auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) {
-    return std::ceil(static_cast<float>(m) / tile1SM) *
-               std::ceil(static_cast<float>(n) / tile1SM) >=
-           sms;
-  };
-  bool use_2sm = should_use_2sm(m, n);
-  if (use_2sm) {
-    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        OutType, Shape<_256, _128, _128>, Shape<_256, _1, _1>,
-        Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
-        cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
-        out, a, b, a_scales, b_scales);
+  constexpr int TILE_K = 128;
+  // TODO: better heuristics
+  bool swap_ab = (m < 16) || (m % 4 != 0);
+  bool use_tma_epilogue = (m * n) % 4 == 0;
+  if (!swap_ab) {
+    constexpr int TILE_N = 128;
+    int tile_m = 256;
+    if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) {
+      tile_m = 64;
+    }
+    else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) {
+      tile_m = 128;
+    }
+    if (tile_m == 64) {
+      if (use_tma_epilogue) {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    } else if (tile_m == 128) {
+      if (use_tma_epilogue) {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    } else { // tile_m == 256
+      if (use_tma_epilogue) {
+          cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+              OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+          cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+              OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
+            Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    }
   } else {
+    // TODO: Test more tile N configs
+    constexpr int TILE_M = 128;
+    constexpr int TILE_N = 16;
+    // TMA epilogue isn't compatible with Swap A/B
     cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        OutType, Shape<_128, _128, _128>, Shape<_128, _1, _1>,
-        Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
-        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+        OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
+        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
         out, a, b, a_scales, b_scales);
   }
 }
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
index 468b77d9593bc5ca42cde31f7866b7ff3f69e85e..6da2da63407590b9a4bd80dac08db1552e764609 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@@ -15,6 +15,7 @@ using c3x::cutlass_gemm_caller;
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue>
 struct sm100_fp8_config_default {
+  // M in (128, inf)
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
   using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
@@ -25,6 +26,34 @@ struct sm100_fp8_config_default {
                             KernelSchedule, EpilogueSchedule>;
 };
 
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_M128 {
+  // M in (64, 128]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _128, _64>;
+  using ClusterShape = Shape<_2, _2, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_M64 {
+  // M in [1, 64]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _8, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
+};
+
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue,
           typename... EpilogueArgs>
@@ -39,8 +68,28 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
   using Cutlass3xGemmDefault =
       typename sm100_fp8_config_default<InType, OutType,
                                         Epilogue>::Cutlass3xGemm;
-  return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-      out, a, b, std::forward<EpilogueArgs>(args)...);
+  using Cutlass3xGemmM64 =
+      typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM128 =
+      typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
+
+  if (mp2 <= 64) {
+    // m in [1, 64]
+    return cutlass_gemm_caller<Cutlass3xGemmM64>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // m in (64, 128]
+    return cutlass_gemm_caller<Cutlass3xGemmM128>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // m in (128, inf)
+    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
 }
 
 template <template <typename, typename, typename> typename Epilogue,
diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
index 2b8bc3fb0b26169953ea14d282ab7a88d27e172f..c88e134ae406bd22e97fa7eb8dd742664d2cc448 100644
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
@@ -84,7 +84,8 @@ void run_cutlass_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
   TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
   TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
   TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
@@ -113,19 +114,23 @@ void run_cutlass_moe_mm_sm90(
   if (n >= 8192) {
     cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
   } else if (k >= 8192) {
     cutlass_group_gemm_caller<Cutlass3xGemmK8192>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
   } else if (m <= 16) {
     cutlass_group_gemm_caller<Cutlass3xGemmM16>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
   } else {
     cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
   }
 }
 
@@ -134,15 +139,18 @@ void dispatch_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
   if (out_tensors.dtype() == torch::kBFloat16) {
     run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
   } else {
     run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::half_t>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides);
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
   }
 }
 
@@ -153,8 +161,9 @@ void cutlass_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
   dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                        expert_offsets, problem_sizes, a_strides, b_strides,
-                       c_strides);
+                       c_strides, per_act_token, per_out_ch);
 }
diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
index db827b7c5e186ed36658e352416e2e262c44617a..bbd82d72e95bd4709e1d58db1847cc6e297fe154 100644
--- a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
+++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh
@@ -76,7 +76,8 @@ void cutlass_group_gemm_caller(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
   using ElementAB = typename Gemm::ElementAB;
   using ElementD = typename Gemm::ElementD;
 
@@ -84,9 +85,6 @@ void cutlass_group_gemm_caller(
   int k_size = a_tensors.size(1);
   int n_size = out_tensors.size(1);
 
-  bool per_act_token = a_scales.numel() != 1;
-  bool per_out_ch = b_scales.numel() != num_experts;
-
   auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
 
   auto options_int =
diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
index 894727383a639b0d2c0858c4a884092aec067811..32254641cc3821aae02cff7a7260be764454fa7f 100644
--- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
@@ -7,7 +7,7 @@
 
 constexpr uint64_t THREADS_PER_EXPERT = 512;
 
-__global__ void compute_problem_sizes(const int* __restrict__ topk_ids,
+__global__ void compute_problem_sizes(const uint32_t* __restrict__ topk_ids,
                                       int32_t* problem_sizes1,
                                       int32_t* problem_sizes2,
                                       int32_t* atomic_buffer,
@@ -45,7 +45,24 @@ __global__ void compute_expert_offsets(
   }
 }
 
-__global__ void compute_arg_sorts(const int* __restrict__ topk_ids,
+__global__ void compute_expert_blockscale_offsets(
+    const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
+    int32_t* blockscale_offsets, int32_t* atomic_buffer,
+    const int num_experts) {
+  int32_t tot_offset = 0;
+  int32_t tot_offset_round = 0;
+  expert_offsets[0] = 0;
+  blockscale_offsets[0] = 0;
+  for (int i = 0; i < num_experts; ++i) {
+    atomic_buffer[i] = tot_offset;
+    tot_offset += problem_sizes1[i * 3];
+    expert_offsets[i + 1] = tot_offset;
+    tot_offset_round += (problem_sizes1[i * 3] + (128 - 1)) / 128 * 128;
+    blockscale_offsets[i + 1] = tot_offset_round;
+  }
+}
+
+__global__ void compute_arg_sorts(const uint32_t* __restrict__ topk_ids,
                                   const int32_t* __restrict__ expert_offsets,
                                   int32_t* input_permutation,
                                   int32_t* output_permutation,
@@ -77,7 +94,8 @@ void get_cutlass_moe_mm_data_caller(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k) {
+    const int64_t num_experts, const int64_t n, const int64_t k,
+    const std::optional<torch::Tensor>& blockscale_offsets) {
   auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
   auto options_int32 =
       torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
@@ -85,19 +103,61 @@ void get_cutlass_moe_mm_data_caller(
 
   int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
   compute_problem_sizes<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const int32_t*>(topk_ids.data_ptr()),
+      static_cast<const uint32_t*>(topk_ids.data_ptr()),
       static_cast<int32_t*>(problem_sizes1.data_ptr()),
       static_cast<int32_t*>(problem_sizes2.data_ptr()),
       static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);
-  compute_expert_offsets<<<1, 1, 0, stream>>>(
-      static_cast<const int32_t*>(problem_sizes1.data_ptr()),
-      static_cast<int32_t*>(expert_offsets.data_ptr()),
-      static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
+  if (blockscale_offsets.has_value()) {
+    compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
+        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
+        static_cast<int32_t*>(expert_offsets.data_ptr()),
+        static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
+  } else {
+    compute_expert_offsets<<<1, 1, 0, stream>>>(
+        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
+        static_cast<int32_t*>(expert_offsets.data_ptr()),
+        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
+  }
   compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const int32_t*>(topk_ids.data_ptr()),
+      static_cast<const uint32_t*>(topk_ids.data_ptr()),
       static_cast<const int32_t*>(expert_offsets.data_ptr()),
       static_cast<int32_t*>(input_permutation.data_ptr()),
       static_cast<int32_t*>(output_permutation.data_ptr()),
       static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(),
       topk_ids.size(1));
 }
+
+__global__ void compute_pplx_data(int32_t* expert_offsets,
+                                  int32_t* problem_sizes1,
+                                  int32_t* problem_sizes2,
+                                  const int32_t* __restrict__ expert_num_tokens,
+                                  const int padded_m, const int n,
+                                  const int k) {
+  int expert_idx = threadIdx.x;
+
+  expert_offsets[expert_idx] = expert_idx * padded_m;
+  problem_sizes1[expert_idx * 3] = expert_num_tokens[expert_idx];
+  problem_sizes1[expert_idx * 3 + 1] = 2 * n;
+  problem_sizes1[expert_idx * 3 + 2] = k;
+  problem_sizes2[expert_idx * 3] = expert_num_tokens[expert_idx];
+  problem_sizes2[expert_idx * 3 + 1] = k;
+  problem_sizes2[expert_idx * 3 + 2] = n;
+}
+
+void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
+                                         torch::Tensor& problem_sizes1,
+                                         torch::Tensor& problem_sizes2,
+                                         const torch::Tensor& expert_num_tokens,
+                                         const int64_t num_local_experts,
+                                         const int64_t padded_m,
+                                         const int64_t n, const int64_t k) {
+  auto stream = at::cuda::getCurrentCUDAStream(expert_offsets.device().index());
+
+  compute_pplx_data<<<1, num_local_experts, 0, stream>>>(
+      static_cast<int32_t*>(expert_offsets.data_ptr()),
+      static_cast<int32_t*>(problem_sizes1.data_ptr()),
+      static_cast<int32_t*>(problem_sizes2.data_ptr()),
+      static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
+      k);
+}
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index e9b408fbf2ee0099e02ad35c68da86fc505a39e5..348525810810ccfc6040e139b0d7b320f3d9b5a7 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -36,7 +36,8 @@ void cutlass_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides);
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch);
 
 #endif
 
@@ -54,7 +55,16 @@ void get_cutlass_moe_mm_data_caller(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k);
+    const int64_t num_experts, const int64_t n, const int64_t k,
+    const std::optional<torch::Tensor>& blockscale_offsets);
+
+void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
+                                         torch::Tensor& problem_sizes1,
+                                         torch::Tensor& problem_sizes2,
+                                         const torch::Tensor& expert_num_tokens,
+                                         const int64_t num_local_experts,
+                                         const int64_t padded_m,
+                                         const int64_t n, const int64_t k);
 #endif
 
 void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
@@ -206,12 +216,13 @@ void cutlass_moe_mm(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch) {
   int32_t version_num = get_sm_version_num();
 #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
   cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                       expert_offsets, problem_sizes, a_strides, b_strides,
-                      c_strides);
+                      c_strides, per_act_token, per_out_ch);
   return;
 #endif
   TORCH_CHECK_NOT_IMPLEMENTED(
@@ -224,7 +235,8 @@ void get_cutlass_moe_mm_data(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k) {
+    const int64_t num_experts, const int64_t n, const int64_t k,
+    const std::optional<torch::Tensor>& blockscale_offsets) {
   // This function currently gets compiled only if we have a valid cutlass moe
   // mm to run it for.
   int32_t version_num = get_sm_version_num();
@@ -232,7 +244,8 @@ void get_cutlass_moe_mm_data(
     (defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM90)
   get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
                                  problem_sizes2, input_permutation,
-                                 output_permutation, num_experts, n, k);
+                                 output_permutation, num_experts, n, k,
+                                 blockscale_offsets);
   return;
 #endif
   TORCH_CHECK_NOT_IMPLEMENTED(
@@ -242,6 +255,29 @@ void get_cutlass_moe_mm_data(
       version_num, ". Required capability: 90");
 }
 
+void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
+                                  torch::Tensor& problem_sizes1,
+                                  torch::Tensor& problem_sizes2,
+                                  const torch::Tensor& expert_num_tokens,
+                                  const int64_t num_local_experts,
+                                  const int64_t padded_m, const int64_t n,
+                                  const int64_t k) {
+  // This function currently gets compiled only if we have a valid cutlass moe
+  // mm to run it for.
+  int32_t version_num = get_sm_version_num();
+#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
+  get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
+                                      problem_sizes2, expert_num_tokens,
+                                      num_local_experts, padded_m, n, k);
+  return;
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false,
+      "No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
+      "for CUDA device capability: ",
+      version_num, ". Required capability: 90");
+}
+
 void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu
index eceb3a8ea05da2b189e8ba0b84362b730541b535..f3f9f669e00a4aa51b2a0760c549df0115ad3582 100644
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/fp8/common.cu
@@ -39,8 +39,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
   fp8_type* __restrict__ token_output = &out[offset];
 
   // For vectorization, token_input and token_output pointers need to be
-  // aligned at 8-byte and 4-byte addresses respectively.
-  bool const can_vectorize = hidden_size % 4 == 0;
+  // aligned at 32-byte and 16-byte addresses respectively.
+  bool const can_vectorize = hidden_size % 16 == 0;
 
   float absmax_val = 0.0f;
   if (can_vectorize) {
@@ -48,24 +48,24 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
   } else {
     for (int i = tid; i < hidden_size; i += blockDim.x) {
       float const x = static_cast<float>(token_input[i]);
-      absmax_val = max(absmax_val, fabs(x));
+      absmax_val = fmaxf(absmax_val, fabsf(x));
     }
   }
 
-  using BlockReduce = cub::BlockReduce<float, 1024>;
+  using BlockReduce = cub::BlockReduce<float, 256>;
   __shared__ typename BlockReduce::TempStorage reduceStorage;
   float const block_absmax_val_maybe =
       BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
   __shared__ float token_scale;
   if (tid == 0) {
     if (scale_ub) {
-      token_scale = min(block_absmax_val_maybe, *scale_ub);
+      token_scale = fminf(block_absmax_val_maybe, *scale_ub);
     } else {
       token_scale = block_absmax_val_maybe;
     }
     // token scale computation
-    token_scale = max(token_scale / quant_type_max_v<fp8_type>,
-                      min_scaling_factor<fp8_type>::val());
+    token_scale = fmaxf(token_scale / quant_type_max_v<fp8_type>,
+                        min_scaling_factor<fp8_type>::val());
     scale[token_idx] = token_scale;
   }
   __syncthreads();
@@ -88,10 +88,11 @@ void static_scaled_fp8_quant(torch::Tensor& out,          // [..., d]
                              torch::Tensor const& input,  // [..., d]
                              torch::Tensor const& scale)  // [1]
 {
-  int64_t num_tokens = input.numel() / input.size(-1);
-  int64_t num_elems = input.numel();
-  dim3 grid(num_tokens);
-  dim3 block(1024);
+  int const block_size = 256;
+  int const num_tokens = input.numel() / input.size(-1);
+  int const num_elems = input.numel();
+  dim3 const grid(num_tokens);
+  dim3 const block(block_size);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
@@ -110,10 +111,11 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out,          // [..., d]
                               torch::Tensor const& input,  // [..., d]
                               torch::Tensor& scale)        // [1]
 {
-  int64_t num_tokens = input.numel() / input.size(-1);
-  int64_t num_elems = input.numel();
-  dim3 grid(num_tokens);
-  dim3 block(1024);
+  int const block_size = 256;
+  int const num_tokens = input.numel() / input.size(-1);
+  int const num_elems = input.numel();
+  dim3 const grid(num_tokens);
+  dim3 const block(block_size);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
@@ -141,8 +143,9 @@ void dynamic_per_token_scaled_fp8_quant(
 
   int const hidden_size = input.size(-1);
   int const num_tokens = input.numel() / hidden_size;
+  int const block_size = 256;
   dim3 const grid(num_tokens);
-  dim3 const block(std::min(hidden_size, 1024));
+  dim3 const block(std::min(hidden_size, block_size));
 
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/fp8/common.cuh
index def8b31b27546744641689d93b90153e2d4d2e7b..d36f94a8f10d6490c408c3e18339bde2778f787d 100644
--- a/csrc/quantization/fp8/common.cuh
+++ b/csrc/quantization/fp8/common.cuh
@@ -46,7 +46,7 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
   }
 
   float r =
-      fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>));
+      fmaxf(-quant_type_max_v<fp8_type>, fminf(x, quant_type_max_v<fp8_type>));
 #ifndef USE_ROCM
   return static_cast<fp8_type>(r);
 #else
@@ -65,7 +65,7 @@ template <typename scalar_t, typename fp8_type>
 __global__ void segmented_max_reduction(float* __restrict__ scale,
                                         const scalar_t* __restrict__ input,
                                         int64_t num_elems) {
-  __shared__ float cache[1024];
+  __shared__ float cache[256];
   int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
 
   // First store maximum for all values processes by
@@ -73,7 +73,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
   scalar_t tmp = 0.0;
   while (i < num_elems) {
     float x = static_cast<float>(input[i]);
-    tmp = max(tmp, fabs(x));
+    tmp = fmaxf(tmp, fabsf(x));
     i += blockDim.x * gridDim.x;
   }
   cache[threadIdx.x] = tmp;
@@ -100,25 +100,27 @@ template <typename scalar_t>
 __device__ float thread_max_vec(scalar_t const* __restrict__ input,
                                 int64_t const num_elems, int const tid,
                                 int const step) {
+  constexpr size_t VEC_SIZE = 16;
+  using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
   // Vectorized input/output to better utilize memory bandwidth.
-  vec4_t<scalar_t> const* vectorized_in =
-      reinterpret_cast<vec4_t<scalar_t> const*>(input);
+  auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
 
-  int64_t const num_vec_elems = num_elems >> 2;
+  // num_elems / VEC_SIZE (which is 16)
+  int64_t const num_vec_elems = num_elems >> 4;
   float absmax_val = 0.0f;
 
-#pragma unroll 4
+#pragma unroll
   for (int64_t i = tid; i < num_vec_elems; i += step) {
-    vec4_t<scalar_t> in_vec = vectorized_in[i];
-    absmax_val = max(absmax_val, fabs(in_vec.x));
-    absmax_val = max(absmax_val, fabs(in_vec.y));
-    absmax_val = max(absmax_val, fabs(in_vec.z));
-    absmax_val = max(absmax_val, fabs(in_vec.w));
+    scalarxN_t in_vec = vectorized_in[i];
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      absmax_val = fmaxf(absmax_val, fabsf(in_vec.val[j]));
+    }
   }
 
-  // Handle the remaining elements if num_elems is not divisible by 4
-  for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
-    absmax_val = max(absmax_val, fabs(input[i]));
+  // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
+  for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
+    absmax_val = fmaxf(absmax_val, fabsf(input[i]));
   }
 
   return absmax_val;
@@ -130,31 +132,31 @@ __device__ void scaled_fp8_conversion_vec(fp8_type* __restrict__ out,
                                           float const scale,
                                           int64_t const num_elems,
                                           int const tid, int const step) {
-  using float8x4_t = q8x4_t<fp8_type>;
+  constexpr size_t VEC_SIZE = 16;
+  using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
+  using float8xN_t = q8_n_t<fp8_type, VEC_SIZE>;
   // Vectorized input/output to better utilize memory bandwidth.
-  auto const* vectorized_in = reinterpret_cast<vec4_t<scalar_t> const*>(input);
-  auto* vectorized_out = reinterpret_cast<float8x4_t*>(out);
+  auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
+  auto* vectorized_out = reinterpret_cast<float8xN_t*>(out);
 
-  int64_t const num_vec_elems = num_elems >> 2;
+  // num_elems / VEC_SIZE (which is 16)
+  int64_t const num_vec_elems = num_elems >> 4;
 
-#pragma unroll 4
+#pragma unroll
   for (int64_t i = tid; i < num_vec_elems; i += step) {
-    vec4_t<scalar_t> in_vec = vectorized_in[i];
-    float8x4_t out_vec;
-
-    out_vec.x = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-        static_cast<float>(in_vec.x), scale);
-    out_vec.y = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-        static_cast<float>(in_vec.y), scale);
-    out_vec.z = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-        static_cast<float>(in_vec.z), scale);
-    out_vec.w = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-        static_cast<float>(in_vec.w), scale);
+    scalarxN_t in_vec = vectorized_in[i];
+    float8xN_t out_vec;
+
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      out_vec.val[j] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
+          static_cast<float>(in_vec.val[j]), scale);
+    }
     vectorized_out[i] = out_vec;
   }
 
-  // Handle the remaining elements if num_elems is not divisible by 4
-  for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
+  // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
+  for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
     out[i] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
         static_cast<float>(input[i]), scale);
   }
diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh
index e6d23cd24e178aa53c73a872dae10de9d2a9ddbc..3f188872d80d3a87989e4036b881f30e3efddcac 100644
--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
+++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
@@ -140,6 +140,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
   // sum of squares
   float ss = 0.0f;
 
+  const int VEC_SIZE = 4;
   int32_t const num_vec_elems = hidden_size >> 2;
 
 #pragma unroll 4
@@ -147,22 +148,23 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
     vec4_t<scalar_t> in = vec_input[i];
 
     vec4_t<float> x;
-    x.x = static_cast<float>(in.x);
-    x.y = static_cast<float>(in.y);
-    x.z = static_cast<float>(in.z);
-    x.w = static_cast<float>(in.w);
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      x.val[j] = static_cast<float>(in.val[j]);
+    }
+
     if constexpr (has_residual) {
       vec4_t<scalar_t> r = vec_residual[i];
-      x.x += static_cast<float>(r.x);
-      x.y += static_cast<float>(r.y);
-      x.z += static_cast<float>(r.z);
-      x.w += static_cast<float>(r.w);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        x.val[j] += static_cast<float>(r.val[j]);
+      }
     }
 
-    ss += x.x * x.x;
-    ss += x.y * x.y;
-    ss += x.z * x.z;
-    ss += x.w * x.w;
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      ss += x.val[j] * x.val[j];
+    }
   }
 
   using BlockReduce = cub::BlockReduce<float, 1024>;
@@ -203,6 +205,7 @@ __device__ void compute_dynamic_per_token_scales(
 
   constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
 
+  const int VEC_SIZE = 4;
   int32_t const num_vec_elems = hidden_size >> 2;
   float block_absmax_val_maybe = 0.0f;
 
@@ -212,26 +215,25 @@ __device__ void compute_dynamic_per_token_scales(
     vec4_t<scalar_t> const w = vec_weight[i];
 
     vec4_t<float> x;
-    x.x = static_cast<float>(in.x);
-    x.y = static_cast<float>(in.y);
-    x.z = static_cast<float>(in.z);
-    x.w = static_cast<float>(in.w);
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      x.val[j] = static_cast<float>(in.val[j]);
+    }
+
     if constexpr (has_residual) {
       vec4_t<scalar_t> r = vec_residual[i];
-      x.x += static_cast<float>(r.x);
-      x.y += static_cast<float>(r.y);
-      x.z += static_cast<float>(r.z);
-      x.w += static_cast<float>(r.w);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        x.val[j] += static_cast<float>(r.val[j]);
+      }
     }
 
-    block_absmax_val_maybe = fmaxf(
-        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.x * rms) * w.x));
-    block_absmax_val_maybe = fmaxf(
-        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.y * rms) * w.y));
-    block_absmax_val_maybe = fmaxf(
-        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.z * rms) * w.z));
-    block_absmax_val_maybe = fmaxf(
-        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.w * rms) * w.w));
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      block_absmax_val_maybe =
+          fmaxf(block_absmax_val_maybe,
+                fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
+    }
   }
 
   using BlockReduce = cub::BlockReduce<float, 1024>;
@@ -282,6 +284,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
     vec_residual = reinterpret_cast<vec4_t<scalar_t>*>(&residual[token_offset]);
   }
 
+  const int VEC_SIZE = 4;
   int32_t const num_vec_elems = hidden_size >> 2;
 
 // TODO(luka/varun) extract into type-agnostic vectorized quant function to
@@ -292,33 +295,31 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
     vec4_t<scalar_t> const w = vec_weight[i];
 
     vec4_t<float> x;
-    x.x = static_cast<float>(in.x);
-    x.y = static_cast<float>(in.y);
-    x.z = static_cast<float>(in.z);
-    x.w = static_cast<float>(in.w);
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      x.val[j] = static_cast<float>(in.val[j]);
+    }
+
     if constexpr (has_residual) {
       vec4_t<scalar_t> r = vec_residual[i];
-      x.x += static_cast<float>(r.x);
-      x.y += static_cast<float>(r.y);
-      x.z += static_cast<float>(r.z);
-      x.w += static_cast<float>(r.w);
-      // Update residual
-      r.x = static_cast<scalar_t>(x.x);
-      r.y = static_cast<scalar_t>(x.y);
-      r.z = static_cast<scalar_t>(x.z);
-      r.w = static_cast<scalar_t>(x.w);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        x.val[j] += static_cast<float>(r.val[j]);
+      }
+// Update residual
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        r.val[j] = static_cast<scalar_t>(x.val[j]);
+      }
       vec_residual[i] = r;
     }
 
     q8x4_t<scalar_out_t> out;
-    out.x = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
-        static_cast<scalar_t>(x.x * rms) * w.x, scale);
-    out.y = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
-        static_cast<scalar_t>(x.y * rms) * w.y, scale);
-    out.z = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
-        static_cast<scalar_t>(x.z * rms) * w.z, scale);
-    out.w = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
-        static_cast<scalar_t>(x.w * rms) * w.w, scale);
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      out.val[j] = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
+          static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale);
+    }
     vec_output[i] = out;
   }
 }
diff --git a/csrc/quantization/gptq_marlin/generate_kernels.py b/csrc/quantization/gptq_marlin/generate_kernels.py
index 4ac7121ab4e1b63316942cf658e77f32a66681d1..18fb6c1a81f84bbe5ace6c9cc5ac573228077278 100644
--- a/csrc/quantization/gptq_marlin/generate_kernels.py
+++ b/csrc/quantization/gptq_marlin/generate_kernels.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import glob
 import itertools
 import os
diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
index 3114e14baa0c5653c0e9e0cb86eec057ca55b752..9af7833d09f32b528f39d5c56a5c899f6d2e6504 100644
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 import math
diff --git a/csrc/quantization/vectorization.cuh b/csrc/quantization/vectorization.cuh
index 866da10b5bc14f3a8fd58d9bfc4fdd8084d83294..11d57a5fafe8941452080e65be15a73e271b46ee 100644
--- a/csrc/quantization/vectorization.cuh
+++ b/csrc/quantization/vectorization.cuh
@@ -10,23 +10,22 @@
 namespace vllm {
 
 // Vectorization containers
-template <typename scalar_t>
-struct __align__(8) vec4_t {
-  scalar_t x;
-  scalar_t y;
-  scalar_t z;
-  scalar_t w;
+template <typename scalar_t, size_t vec_size>
+struct __align__(vec_size * sizeof(scalar_t)) vec_n_t {
+  scalar_t val[vec_size];
 };
 
-template <typename quant_type_t>
-struct __align__(4) q8x4_t {
+template <typename quant_type_t, size_t vec_size>
+struct __align__(vec_size * sizeof(quant_type_t)) q8_n_t {
   static_assert(std::is_same_v<quant_type_t, int8_t> ||
                 std::is_same_v<quant_type_t, c10::Float8_e4m3fn> ||
                 std::is_same_v<quant_type_t, c10::Float8_e4m3fnuz>);
-  quant_type_t x;
-  quant_type_t y;
-  quant_type_t z;
-  quant_type_t w;
+  quant_type_t val[vec_size];
 };
 
+template <typename scalar_t>
+using vec4_t = vec_n_t<scalar_t, 4>;
+template <typename quant_type_t>
+using q8x4_t = q8_n_t<quant_type_t, 4>;
+
 }  // namespace vllm
diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu
index b3717892db784a125a8795223abd88c5be06b28e..e31aa0162628f397c0c16d8303d40858b90aefb8 100644
--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@@ -13,14 +13,34 @@
 #include "dispatch_utils.h"
 #include "quantization/fp8/common.cuh"
 
-#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__))
-  #define __HIP__MI300_MI250__
+#if defined(__HIPCC__) && \
+    (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
+  #define __HIP__GFX9__
 #endif
 
-#if defined(__HIPCC__) && defined(__gfx942__)
-  #define __HIP__MI300__
+#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__))
+  #define __HIP__MI3XX__
 #endif
 
+#if defined(__gfx950__)
+  #define LDS_SIZE 160 * 1024
+#else
+  #define LDS_SIZE 64 * 1024
+#endif
+
+int get_lds_size() {
+  static bool is_cached = false;
+  static int result;
+  if (is_cached == false) {
+    auto dprops = at::cuda::getCurrentDeviceProperties();
+    std::string device_arch = dprops->gcnArchName;
+    size_t substring = device_arch.find("gfx95");
+    result = (substring == std::string::npos ? 64 * 1024 : 160 * 1024);
+    is_cached = true;
+  }
+  return result;
+}
+
 #if defined(NDEBUG)
   #undef NDEBUG
   #include <assert.h>
@@ -267,7 +287,7 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
     V0 += (s.x + s.y);                                                        \
   }
 
-#if defined(__HIP__MI300_MI250__)  // TODO: Add NAVI support
+#if defined(__HIP__GFX9__)  // TODO: Add NAVI support
 // This version targets cases where A[] fits LDS capacity
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
@@ -275,7 +295,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
                      const scalar_t* __restrict__ A, scalar_t* C,
                      const int _WvPrGrp, const int CuCount) {
-  #if defined(__HIP__MI300__)
+  constexpr int max_lds_len = LDS_SIZE / 2;
+  #if defined(__HIP__MI3XX__)
   constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
   #else
   constexpr bool use_mfma = false;
@@ -295,13 +316,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   };
 
   //----------------------------------------------------
-  // Reserving 64 KB of LDS to have 1 WG / CU
+  // Reserving 64/160 KB of LDS to have 1 WG / CU
   // Goal is to bring the activation matrix A to the LDS
   // and use it across the lifetime of the work group
   // TODO: When activation matrix is larger than 64 KB
   //	     then this is not goint to work!
   //----------------------------------------------------
-  __shared__ scalar_t s[1024 * 32];
+  __shared__ scalar_t s[max_lds_len];
 
   //----------------------------------------------------
   // Fetch the activation matrix to LDS
@@ -312,11 +333,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   // - Then the WG will move to another 8 K elements
   // TODO: Logic below will only work when K is multiple of 8
   //----------------------------------------------------
-  for (uint32_t k = 0; k < min(K * N, 32 * 1024);
+  for (uint32_t k = 0; k < min(K * N, max_lds_len);
        k += THRDS * WvPrGrp * A_CHUNK) {
     uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
 
-    if (k_in >= min(K * N, 32 * 1024)) break;
+    if (k_in >= min(K * N, max_lds_len)) break;
 
     *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
   }
@@ -517,7 +538,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     m += CuCount * _WvPrGrp * YTILE;
   }
 }
-#else   // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
+#else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
@@ -525,9 +546,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
                                  const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
+#endif  // defined(__HIP__GFX9__) TODO: Add NAVI support
 
-#if defined(__HIP__MI300_MI250__)  // TODO: Add NAVI support
+#if defined(__HIP__GFX9__)  // TODO: Add NAVI support
 // This version targets cases where A[] marginally exceeds LDS capacity
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
@@ -535,7 +556,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     wvSplitK_hf_(const int K, const int M, const scalar_t* B,
                  const scalar_t* __restrict__ A, scalar_t* C,
                  const int _WvPrGrp, const int CuCount) {
-  #if defined(__HIP__MI300__)
+  constexpr int max_lds_len = LDS_SIZE / 2;
+  #if defined(__HIP__MI3XX__)
   constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
   #else
   constexpr bool use_mfma = false;
@@ -561,7 +583,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   // TODO: When activation matrix is larger than 64 KB
   //	     then this is not goint to work!
   //----------------------------------------------------
-  __shared__ scalar_t s[1024 * 32];
+  __shared__ scalar_t s[max_lds_len];
 
   //----------------------------------------------------
   // Computation of columns that need to be committed to memory!
@@ -598,11 +620,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   // - Then the WG will move to another 8 K elements
   // TODO: Logic below will only work when K is multiple of 8
   //----------------------------------------------------
-  for (uint32_t k = 0; k < min(K * N, 32 * 1024);
+  for (uint32_t k = 0; k < min(K * N, max_lds_len);
        k += THRDS * WvPrGrp * A_CHUNK) {
     uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
 
-    if (k_in >= min(K * N, 32 * 1024)) break;
+    if (k_in >= min(K * N, max_lds_len)) break;
 
     *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
   }
@@ -686,7 +708,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
         // Fetch A activation matrix in interleaved fashion from LDS or memory
 
         for (int n = 0; n < N; n++) {
-          if (k_ + K * n < 32 * 1024)
+          if (k_ + K * n < max_lds_len)
             bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
           else
             bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
@@ -817,7 +839,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   }
 }
 
-#else   // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
+#else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
@@ -825,9 +847,9 @@ __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
                              const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
+#endif  // defined(__HIP__GFX9__) TODO: Add NAVI support
 
-#if defined(__HIP__MI300_MI250__)  // TODO: Add NAVI support
+#if defined(__HIP__GFX9__)  // TODO: Add NAVI support
 // This version targets big A[] cases, where it is much larger than LDS capacity
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
@@ -835,7 +857,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
                      const scalar_t* __restrict__ A, scalar_t* C,
                      const int _WvPrGrp, const int CuCount) {
-  #if defined(__HIP__MI300__)
+  constexpr int max_lds_len = LDS_SIZE / 2;
+  #if defined(__HIP__MI3XX__)
   constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
   #else
   constexpr bool use_mfma = false;
@@ -855,13 +878,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   };
 
   //----------------------------------------------------
-  // Reserving 64 KB of LDS to have 1 WG / CU
+  // Reserving 64/160 KB of LDS to have 1 WG / CU
   // Goal is to bring the activation matrix A to the LDS
   // and use it across the lifetime of the work group
   // TODO: When activation matrix is larger than 64 KB
   //	     then this is not goint to work!
   //----------------------------------------------------
-  __shared__ scalar_t s[1024 * 32];
+  __shared__ scalar_t s[max_lds_len];
 
   //----------------------------------------------------
   // Computation of columns that need to be committed to memory!
@@ -902,11 +925,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   //----------------------------------------------------
   #define PCML
   #ifndef PCML
-  for (uint32_t k = 0; k < min(K * N, 32 * 1024);
+  for (uint32_t k = 0; k < min(K * N, max_lds_len);
        k += THRDS * WvPrGrp * A_CHUNK) {
     uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
 
-    if (k_in >= min(K * N, 32 * 1024)) break;
+    if (k_in >= min(K * N, max_lds_len)) break;
 
     *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
   }
@@ -916,7 +939,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   #define TUC (THRDS * UNRL * A_CHUNK)
   uint32_t kBase = 0;
   // find biggest k size that fits in LDS
-  uint32_t kFit = (32 * 1024) / N;
+  uint32_t kFit = (max_lds_len) / N;
   // kFit = (kFit%TWC==0) ? kFit : (kFit-kFit%TWC+TWC); //round up to multiple
   // of TUC
   kFit = (kFit % TUC == 0)
@@ -1164,7 +1187,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     }
   }
 }
-#else   // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
+#else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
@@ -1172,7 +1195,7 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
                                  const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
+#endif  // defined(__HIP__GFX9__) TODO: Add NAVI support
 
 int mindiv(int N, int div1, int div2) {
   int nPrRnd = div1 * div2;
@@ -1222,17 +1245,18 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
 
   const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const int max_lds_len = get_lds_size() / 2;
 
 #define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
                  _N)                                                          \
   {                                                                           \
     dim3 block(64, _WvPrGrp);                                                 \
-    if ((K_in * N_in <= 32 * 1024) && (M_in % _YTILEs == 0)) {                \
+    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) {              \
       int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp);              \
       wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N>          \
           <<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp,    \
                                        CuCount);                              \
-    } else if (K_in * N_in <= 32 * 1024 * 1.2) {                              \
+    } else if (K_in * N_in <= max_lds_len * 1.2) {                            \
       int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp);              \
       wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N>              \
           <<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp,    \
@@ -1272,7 +1296,7 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
   return out_c;
 }
 
-#if defined(__HIP__MI300__)  // TODO: Add NAVI support
+#if defined(__HIP__MI3XX__)  // TODO: Add NAVI support
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
           int A_CHUNK, int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
@@ -1281,6 +1305,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
                       const float* __restrict__ s_A,
                       const float* __restrict__ s_B, const int _WvPrGrp,
                       const int CuCount) {
+  constexpr int max_lds_len = LDS_SIZE;
   using scalar8 =
       __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
   using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
@@ -1296,10 +1321,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     scalar8 h8;
   };
 
-  __shared__ fp8_t s[1024 * 64];
+  __shared__ fp8_t s[max_lds_len];
 
   for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
-       k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) {
+       k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
     *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
   }
   __syncthreads();
@@ -1436,7 +1461,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     m += CuCount * _WvPrGrp * YTILE;
   }
 }
-#else   // !defined(__HIP__MI300__) TODO: Add NAVI support
+#else   // !defined(__HIP__MI3XX__) TODO: Add NAVI support
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
           int A_CHUNK, int UNRL, int N>
 __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
@@ -1446,9 +1471,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
                                   const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__MI300__) TODO: Add NAVI support
+#endif  // defined(__HIP__MI3XX__) TODO: Add NAVI support
 
-#if defined(__HIP__MI300__)  // TODO: Add NAVI support
+#if defined(__HIP__MI3XX__)  // TODO: Add NAVI support
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
           int A_CHUNK, int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
@@ -1456,6 +1481,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
                   const fp8_t* __restrict__ A, scalar_t* C,
                   const float* __restrict__ s_A, const float* __restrict__ s_B,
                   const int _WvPrGrp, const int CuCount) {
+  constexpr int max_lds_len = LDS_SIZE;
   using scalar8 =
       __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
   using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
@@ -1471,10 +1497,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     scalar8 h8;
   };
 
-  __shared__ fp8_t s[1024 * 64];
+  __shared__ fp8_t s[max_lds_len];
 
   for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
-       k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) {
+       k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
     *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
   }
   __syncthreads();
@@ -1517,7 +1543,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
         uint32_t k_ = k + threadIdx.x * A_CHUNK;
         if (k_ >= K) break;
         for (int n = 0; n < N; n++) {
-          if (k_ + K * n < 64 * 1024)
+          if (k_ + K * n < max_lds_len)
             bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
           else
             bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
@@ -1608,7 +1634,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     m += CuCount * _WvPrGrp * YTILE;
   }
 }
-#else   // !defined(__HIP__MI300__) TODO: Add NAVI support
+#else   // !defined(__HIP__MI3XX__) TODO: Add NAVI support
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
           int A_CHUNK, int UNRL, int N>
 __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
@@ -1618,7 +1644,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
                               const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__MI300__) TODO: Add NAVI support
+#endif  // defined(__HIP__MI3XX__) TODO: Add NAVI support
 
 void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
                at::Tensor& scale_a, at::Tensor& scale_b,
@@ -1638,12 +1664,13 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
   dim3 grid(CuCount);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const int max_lds_len = get_lds_size();
 
 #define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
                   _N)                                                          \
   {                                                                            \
     dim3 block(64, _WvPrGrp);                                                  \
-    if ((K_in * N_in <= 64 * 1024) && (M_in % _YTILEs == 0)) {                 \
+    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) {               \
       int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp);               \
       wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N>  \
           <<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \
diff --git a/csrc/sampler.cu b/csrc/sampler.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ee5793dda0ef8fbac9c9a66c762933f2f879d19f
--- /dev/null
+++ b/csrc/sampler.cu
@@ -0,0 +1,86 @@
+#include "dispatch_utils.h"
+
+#include <torch/cuda.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#ifndef USE_ROCM
+  #include <cub/cub.cuh>
+#else
+  #include <hipcub/hipcub.hpp>
+#endif
+
+namespace vllm {
+
+template <typename scalar_t>
+__global__ void apply_repetition_penalties_kernel(
+    scalar_t* __restrict__ logits,         // [num_seqs, vocab_size]
+    const bool* __restrict__ prompt_mask,  // [num_seqs, vocab_size]
+    const bool* __restrict__ output_mask,  // [num_seqs, vocab_size]
+    const scalar_t* __restrict__ repetition_penalties,  // [num_seqs]
+    const int num_seqs, const int vocab_size, const int tile_size) {
+  // Each block handles one sequence and a tile of vocab
+  const int seq_idx = blockIdx.x;
+  if (seq_idx >= num_seqs) return;
+
+  const int tile_start = blockIdx.y * tile_size;
+  const int tile_end = min(tile_start + tile_size, vocab_size);
+
+  // Load repetition penalty for this sequence
+  const scalar_t penalty = repetition_penalties[seq_idx];
+
+  // Each thread processes multiple vocab items within the tile
+  for (int vocab_idx = tile_start + threadIdx.x; vocab_idx < tile_end;
+       vocab_idx += blockDim.x) {
+    const int64_t idx = static_cast<int64_t>(seq_idx) * vocab_size + vocab_idx;
+    const bool is_repeated = prompt_mask[idx] || output_mask[idx];
+    if (is_repeated) {
+      scalar_t logit = logits[idx];
+      if (logit > 0) {
+        logits[idx] = logit / penalty;
+      } else {
+        logits[idx] = logit * penalty;
+      }
+    }
+  }
+}
+
+}  // namespace vllm
+
+void apply_repetition_penalties_(
+    torch::Tensor& logits,             // [num_seqs, vocab_size], in-place
+    const torch::Tensor& prompt_mask,  // [num_seqs, vocab_size]
+    const torch::Tensor& output_mask,  // [num_seqs, vocab_size]
+    const torch::Tensor& repetition_penalties) {  // [num_seqs]
+  TORCH_CHECK(logits.is_contiguous());
+  TORCH_CHECK(prompt_mask.is_contiguous());
+  TORCH_CHECK(output_mask.is_contiguous());
+  TORCH_CHECK(repetition_penalties.is_contiguous());
+
+  int vocab_size = logits.size(-1);
+  int num_seqs = logits.size(0);
+
+  // Get number of SMs on the current device
+  int sms = 0;
+  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,
+                         logits.get_device());
+
+  // Compute tile_num and tile_size
+  int tile_num =
+      std::min(vocab_size, std::max(1, (sms + num_seqs - 1) / num_seqs));
+  int tile_size = (vocab_size + tile_num - 1) / tile_num;
+
+  // Each block handles one sequence and a tile of vocab
+  dim3 grid(num_seqs, tile_num);
+  dim3 block(std::min(tile_size, 1024));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(logits));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FLOATING_TYPES(
+      logits.scalar_type(), "apply_repetition_penalties_kernel", [&] {
+        vllm::apply_repetition_penalties_kernel<scalar_t>
+            <<<grid, block, 0, stream>>>(
+                logits.data_ptr<scalar_t>(), prompt_mask.data_ptr<bool>(),
+                output_mask.data_ptr<bool>(),
+                repetition_penalties.data_ptr<scalar_t>(), num_seqs, vocab_size,
+                tile_size);
+      });
+}
\ No newline at end of file
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 4b1986916326fb6ee460b667599a2cd057e2740a..afaf021ec7179d81d8c16d32cef87faf726a1b4a 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -170,6 +170,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "float epsilon) -> ()");
   ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
 
+  // Apply repetition penalties to logits in-place
+  ops.def(
+      "apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "
+      "Tensor output_mask, Tensor repetition_penalties) -> ()");
+  ops.impl("apply_repetition_penalties_", torch::kCUDA,
+           &apply_repetition_penalties_);
+
   // Layernorm-quant
   // Apply Root Mean Square (RMS) Normalization to the input tensor.
 //   ops.def(
@@ -432,7 +439,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "cutlass_moe_mm(Tensor! out_tensors, Tensor a_tensors, Tensor b_tensors, "
       "               Tensor a_scales, Tensor b_scales, Tensor expert_offsets, "
       "               Tensor problem_sizes, Tensor a_strides, "
-      "               Tensor b_strides, Tensor c_strides) -> ()",
+      "               Tensor b_strides, Tensor c_strides, bool per_act_token, "
+      "               bool per_out_ch) -> ()",
       {stride_tag});
   ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm);
 
@@ -447,10 +455,26 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "                        Tensor! problem_sizes1, Tensor! problem_sizes2, "
       "                        Tensor! input_permutation, "
       "                        Tensor! output_permutation, int num_experts, "
-      "                        int n, int k) -> ()",
+      "                        int n, int k, Tensor? blockscale_offsets) -> ()",
       {stride_tag});
   ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);
 
+  // A function that computes data required to run fused MoE with w8a8 grouped
+  // GEMM and PPLX. It takes expert_num_tokens and non_zero_expert_idxs
+  // as an input, and computes expert_offsets (token start indices of each
+  // expert). In addition to this, it computes problem sizes for each expert's
+  // multiplication used by the two mms called from fused MoE operation.
+  ops.def(
+      "get_cutlass_pplx_moe_mm_data(Tensor! expert_offsets, "
+      "                             Tensor! problem_sizes1, "
+      "                             Tensor! problem_sizes2, "
+      "                             Tensor expert_num_tokens, "
+      "                             int num_local_experts, int padded_m, "
+      "                             int n, int k) -> ()",
+      {stride_tag});
+  ops.impl("get_cutlass_pplx_moe_mm_data", torch::kCUDA,
+           &get_cutlass_pplx_moe_mm_data);
+
   // Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3)
   ops.def(
       "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 5395b3884fb528bb01c5453d252a71aa181f6afb..6db2f307a3800a6ee7fc52cb322dc99e1dc66a5f 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -75,6 +75,7 @@ RUN --mount=type=bind,source=.git,target=.git \
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
     --mount=type=bind,source=.git,target=.git \
     VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel 
 
@@ -85,7 +86,7 @@ WORKDIR /workspace/vllm
 
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    apt-get install -y --no-install-recommends vim numactl
+    apt-get install -y --no-install-recommends vim numactl xz-utils
 
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -108,8 +109,11 @@ FROM base AS vllm-test
 WORKDIR /workspace/
 
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \
-    uv pip install -r requirements/test.txt
+    --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
+    cp requirements/test.in requirements/test-cpu.in && \
+    sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
+    uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \
+    uv pip install -r requirements/cpu-test.txt
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
diff --git a/docker/Dockerfile.neuron b/docker/Dockerfile.neuron
index 259dc5a23f78b4bd6d9666f11ca98a7d4303eee6..8bc23554718dcdf28a7b32778c182573620c961e 100644
--- a/docker/Dockerfile.neuron
+++ b/docker/Dockerfile.neuron
@@ -34,7 +34,7 @@ RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 
 RUN python3 -m pip install -U \
-        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+        'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
         -r requirements/neuron.txt
 
 ENV VLLM_TARGET_DEVICE neuron
diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index 53b8ccd804924da807e1e378b62eb8f9428840cf..8d43de77aad596ae7291c9e835d5e83a46dcafee 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -312,4 +312,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Logging to confirm the torch versions
 RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
 
+# Logging to confirm all the packages are installed
+RUN pip freeze
+
 #################### UNITTEST IMAGE #############################
diff --git a/docker/Dockerfile.ppc64le b/docker/Dockerfile.ppc64le
index 14043eb7a8e3ba6ebca87b638e59525f83117d6f..aaff240388f2cdc5af70a2b1bf39430fb2ef7cc7 100644
--- a/docker/Dockerfile.ppc64le
+++ b/docker/Dockerfile.ppc64le
@@ -1,10 +1,41 @@
 ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
 
+###############################################################
+# Stage to build openblas
+###############################################################
+
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder
+
+ARG MAX_JOBS
+ARG OPENBLAS_VERSION=0.3.29
+RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \
+    && source /opt/rh/gcc-toolset-13/enable \
+    && wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
+    && unzip OpenBLAS-$OPENBLAS_VERSION.zip \
+    && cd OpenBLAS-$OPENBLAS_VERSION \
+    &&  make -j${MAX_JOBS} TARGET=POWER9 BINARY=64 USE_OPENMP=1 USE_THREAD=1 NUM_THREADS=120 DYNAMIC_ARCH=1 INTERFACE64=0 \
+    && cd /tmp && touch control
+
+
+###############################################################
+# base stage with dependencies coming from centos mirrors
+###############################################################
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS centos-deps-builder
+RUN  microdnf install -y dnf && \ 
+     dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
+        https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
+        https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
+        dnf config-manager --set-enabled crb
+
+RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel && \
+    dnf remove -y centos-gpg-keys-9.0-24.el9.noarch centos-stream-repos-9.0-24.el9.noarch 
+
+
 ###############################################################
 # base stage with basic dependencies
 ###############################################################
 
-FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base-builder
+FROM centos-deps-builder AS base-builder
 
 ARG PYTHON_VERSION=3.12
 ARG OPENBLAS_VERSION=0.3.29
@@ -20,25 +51,27 @@ ENV UV_LINK_MODE=copy
 # Note: A symlink for libatomic.so is created for gcc-13 (linker fails to find libatomic otherwise - reqd. for sentencepiece)
 # Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
 #       when `--jobs=<N>` is passed with podman build command
-RUN microdnf install -y openssl-devel dnf \
-    && dnf install -y  https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
-    && dnf config-manager --set-enabled codeready-builder-for-rhel-9-ppc64le-rpms \
+
+COPY --from=openblas-builder /tmp/control /dev/null
+
+RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
+    dnf install -y openssl-devel \
     && dnf install -y \
-       git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
+       git tar gcc-toolset-13 automake libtool \
        pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
-       libtiff-devel libjpeg-devel openjpeg2-devel zlib-devel \
-       freetype-devel lcms2-devel libwebp-devel tcl-devel tk-devel \
-       harfbuzz-devel fribidi-devel libraqm-devel libimagequant-devel libxcb-devel \
+       libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
+       harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
        python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
     && dnf clean all \
+    && PREFIX=/usr/local make -C /openblas install \
     && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
     && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
     && python -m pip install -U pip uv \
     && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \
-    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
     && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
     && cd /tmp && touch control
 
+
 ###############################################################
 # Stage to build torch family
 ###############################################################
@@ -48,6 +81,8 @@ FROM base-builder AS torch-builder
 ARG MAX_JOBS
 ARG TORCH_VERSION=2.6.0
 ARG _GLIBCXX_USE_CXX11_ABI=1
+ARG OPENBLAS_VERSION=0.3.29
+
 RUN --mount=type=cache,target=/root/.cache/uv \
     source /opt/rh/gcc-toolset-13/enable &&  \
     git clone --recursive https://github.com/pytorch/pytorch.git -b v${TORCH_VERSION} && \
@@ -109,7 +144,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         .. && \
     make install -j ${MAX_JOBS:-$(nproc)} && \
     cd ../../python/ && \
-    uv pip install -v -r requirements-wheel-build.txt && \
+    uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \
+    pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \
     PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
     python setup.py build_ext \
     --build-type=release --bundle-arrow-cpp \
@@ -132,8 +168,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     cd opencv-python && \
     sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \
     cd opencv && git cherry-pick --no-commit $OPENCV_PATCH && cd .. && \
+    uv pip install scikit-build && \    
     python -m build --wheel --installer=uv --outdir /opencvwheels/
 
+###############################################################
+# Stage to build numactl
+###############################################################
+
+FROM base-builder AS numa-builder
+
+# Note: Building numactl with gcc-11. Compiling with gcc-13 in this builder stage will
+# trigger recompilation with gcc-11 (and require libtool) in the final stage where we do not have gcc-13
+ARG MAX_JOBS
+ARG NUMACTL_VERSION=2.0.19
+RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_VERSION} \
+    && cd numactl \
+    && autoreconf -i && ./configure \
+    && make -j ${MAX_JOBS:-$(nproc)}
+
+
 ###############################################################
 # Stage to build vllm - this stage builds and installs
 # vllm, tensorizer and vllm-tgis-adapter and builds uv cache
@@ -145,6 +198,7 @@ FROM base-builder AS vllmcache-builder
 COPY --from=torch-builder /tmp/control /dev/null
 COPY --from=arrow-builder /tmp/control /dev/null
 COPY --from=cv-builder /tmp/control /dev/null
+COPY --from=numa-builder /tmp/control /dev/null
 
 ARG VLLM_TARGET_DEVICE=cpu
 ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
@@ -160,11 +214,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
     --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
     --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
+    --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
     --mount=type=bind,src=.,dst=/src/,rw \
     source /opt/rh/gcc-toolset-13/enable && \
     uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
     sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
     uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
+    make -C /numactl install && \
     # sentencepiece.pc is in some pkgconfig inside uv cache
     export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
     uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
@@ -173,21 +229,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install /vllmwheel/*.whl
 
 
-###############################################################
-# Stage to build numactl
-###############################################################
-
-FROM base-builder AS numa-builder
-
-# Note: Building numactl with gcc-11. Compiling with gcc-13 in this builder stage will
-# trigger recompilation with gcc-11 (and require libtool) in the final stage where we do not have gcc-13
-ARG MAX_JOBS
-ARG NUMACTL_VERSION=2.0.19
-RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_VERSION} \
-    && cd numactl \
-    && autoreconf -i && ./configure \
-    && make -j ${MAX_JOBS:-$(nproc)}
-
 ###############################################################
 # Stage to build lapack
 ###############################################################
@@ -217,6 +258,7 @@ ENV PATH=${VIRTUAL_ENV}/bin:$PATH
 ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64:/usr/local/lib:/usr/lib64:/usr/lib
 ENV UV_LINK_MODE=copy
+ENV OMP_NUM_THREADS=16
 
 # create artificial dependencies between stages for independent stages to build in parallel
 COPY --from=torch-builder /tmp/control /dev/null
@@ -225,11 +267,13 @@ COPY --from=cv-builder /tmp/control /dev/null
 COPY --from=vllmcache-builder /tmp/control /dev/null
 COPY --from=numa-builder /tmp/control /dev/null
 COPY --from=lapack-builder /tmp/control /dev/null
+COPY --from=openblas-builder /tmp/control /dev/null
 
 # install gcc-11, python, openblas, numactl, lapack
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
     --mount=type=bind,from=lapack-builder,source=/lapack/,target=/lapack/,rw \
+    --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
     rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
     microdnf install --nodocs -y \
     tar findutils openssl \
@@ -241,8 +285,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     && microdnf clean all \
     && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
     && python -m pip install -U pip uv --no-cache \
-    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
     && make -C /numactl install \
+    && PREFIX=/usr/local make -C /openblas install \
     && uv pip install 'cmake<4' \
     && cmake --install /lapack/build \
     && uv pip uninstall cmake
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index e60cf5e69a4c46d258859d7700e74a66201ab87d..4f40f32a39f26a5352b8806ba59382cc085fcbdc 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -1,7 +1,5 @@
 # default base image
 ARG REMOTE_VLLM="0"
-ARG USE_CYTHON="0"
-ARG BUILD_RPD="1"
 ARG COMMON_WORKDIR=/app
 ARG BASE_IMAGE=rocm/vllm-dev:base
 
@@ -15,7 +13,7 @@ RUN apt-get update -q -y && apt-get install -q -y \
     sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
     apt-transport-https ca-certificates wget curl
 # Remove sccache    
-RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
+RUN python3 -m pip install --upgrade pip
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
 ARG COMMON_WORKDIR
 WORKDIR ${COMMON_WORKDIR}
@@ -30,18 +28,17 @@ ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
 ARG VLLM_BRANCH="main"
 ONBUILD RUN git clone ${VLLM_REPO} \
 	    && cd vllm \
-	    && git checkout ${VLLM_BRANCH}
+	    && git fetch -v --prune -- origin ${VLLM_BRANCH} \
+	    && git checkout FETCH_HEAD
 FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 
 # -----------------------
 # vLLM build stages
 FROM fetch_vllm AS build_vllm
-ARG USE_CYTHON
 # Build vLLM
 RUN cd vllm \
     && python3 -m pip install -r requirements/rocm.txt \
     && python3 setup.py clean --all  \
-    && if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
     && python3 setup.py bdist_wheel --dist-dir=dist
 FROM scratch AS export_vllm
 ARG COMMON_WORKDIR
@@ -90,13 +87,6 @@ RUN case "$(which python3)" in \
         *) ;; esac
 
 RUN python3 -m pip install --upgrade huggingface-hub[cli]
-ARG BUILD_RPD
-RUN if [ ${BUILD_RPD} -eq "1" ]; then \
-    git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \
-    && cd rocmProfileData/rpd_tracer \
-    && pip install -r requirements.txt && cd ../ \
-    && make && make install \
-    && cd hipMarker && python3 setup.py install ; fi
 
 # Install vLLM
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
@@ -117,12 +107,6 @@ ENV TOKENIZERS_PARALLELISM=false
 # ENV that can improve safe tensor loading, and end-to-end time
 ENV SAFETENSORS_FAST_GPU=1
 
-# User-friendly environment setting for multi-processing to avoid below RuntimeError.
-# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
-# you must use the 'spawn' start method 
-# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-
 # Performance environment variable.
 ENV HIP_FORCE_DEV_KERNARG=1
 
diff --git a/docs/.nav.yml b/docs/.nav.yml
index 42aba97753605dce7c5bcc7655aee9556a55e5f5..a9c594c291777a5a1b268ca3773a2ca3a29617cd 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -12,6 +12,7 @@ nav:
       - User Guide: usage/README.md
       - Developer Guide: contributing/README.md
       - API Reference: api/README.md
+      - CLI Reference: cli/README.md
     - Timeline:
       - Roadmap: https://roadmap.vllm.ai
       - Releases: https://github.com/vllm-project/vllm/releases
@@ -56,6 +57,8 @@ nav:
     - Contents:
       - glob: api/vllm/*
         preserve_directory_names: true
+  - CLI Reference:
+    - Summary: cli/README.md
   - Community:
     - community/*
     - Blog: https://blog.vllm.ai
diff --git a/docs/README.md b/docs/README.md
index 57b1d03deee281dcffb5740fed5a9c568a691853..0c6aff5fa07c30a14f30c8e14ccf31ce10715ae6 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -12,8 +12,8 @@
 <p style="text-align:center">
 <script async defer src="https://buttons.github.io/buttons.js"></script>
 <a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a>
-<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
-<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
+<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-show-count="true" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
+<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-show-count="true" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
 </p>
 
 vLLM is a fast and easy-to-use library for LLM inference and serving.
diff --git a/docs/cli/README.md b/docs/cli/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..df700fb743c0656dacf35ef96e859a7156712fc9
--- /dev/null
+++ b/docs/cli/README.md
@@ -0,0 +1,168 @@
+# vLLM CLI Guide
+
+The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
+
+```
+vllm --help
+```
+
+Available Commands:
+
+```
+vllm {chat,complete,serve,bench,collect-env,run-batch}
+```
+
+## serve
+
+Start the vLLM OpenAI Compatible API server.
+
+Examples:
+
+```bash
+# Start with a model
+vllm serve meta-llama/Llama-2-7b-hf
+
+# Specify the port
+vllm serve meta-llama/Llama-2-7b-hf --port 8100
+
+# Check with --help for more options
+# To list all groups
+vllm serve --help=listgroup
+
+# To view a argument group
+vllm serve --help=ModelConfig
+
+# To view a single argument
+vllm serve --help=max-num-seqs
+
+# To search by keyword
+vllm serve --help=max
+```
+
+## chat
+
+Generate chat completions via the running API server.
+
+Examples:
+
+```bash
+# Directly connect to localhost API without arguments
+vllm chat
+
+# Specify API url
+vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1
+
+# Quick chat with a single prompt
+vllm chat --quick "hi"
+```
+
+## complete
+
+Generate text completions based on the given prompt via the running API server.
+
+Examples:
+
+```bash
+# Directly connect to localhost API without arguments
+vllm complete
+
+# Specify API url
+vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
+
+# Quick complete with a single prompt
+vllm complete --quick "The future of AI is"
+```
+
+## bench
+
+Run benchmark tests for latency online serving throughput and offline inference throughput.
+
+To use benchmark commands, please install with extra dependencies using `pip install vllm[bench]`.
+
+Available Commands:
+
+```bash
+vllm bench {latency, serve, throughput}
+```
+
+### latency
+
+Benchmark the latency of a single batch of requests.
+
+Example:
+
+```bash
+vllm bench latency \
+    --model meta-llama/Llama-3.2-1B-Instruct \
+    --input-len 32 \
+    --output-len 1 \
+    --enforce-eager \
+    --load-format dummy
+```
+
+### serve
+
+Benchmark the online serving throughput.
+
+Example:
+
+```bash
+vllm bench serve \
+    --model meta-llama/Llama-3.2-1B-Instruct \
+    --host server-host \
+    --port server-port \
+    --random-input-len 32 \
+    --random-output-len 4  \
+    --num-prompts  5
+```
+
+### throughput
+
+Benchmark offline inference throughput.
+
+Example:
+
+```bash
+vllm bench throughput \
+    --model meta-llama/Llama-3.2-1B-Instruct \
+    --input-len 32 \
+    --output-len 1 \
+    --enforce-eager \
+    --load-format dummy
+```
+
+## collect-env
+
+Start collecting environment information.
+
+```bash
+vllm collect-env
+```
+
+## run-batch
+
+Run batch prompts and write results to file.
+
+Examples:
+
+```bash
+# Running with a local file
+vllm run-batch \
+    -i offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+
+# Using remote file
+vllm run-batch \
+    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
+## More Help
+
+For detailed options of any subcommand, use:
+
+```bash
+vllm <subcommand> --help
+```
diff --git a/docs/contributing/README.md b/docs/contributing/README.md
index 2517436afcc11cd28713c75b80df317c81952b73..65ae9cc96367611f1b69a0c3791d88c3d5b49be8 100644
--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@@ -29,20 +29,68 @@ See <gh-file:LICENSE>.
 Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
 Check out the [building from source][build-from-source] documentation for details.
 
-### Building the docs
+### Building the docs with MkDocs
 
-Install the dependencies:
+#### Introduction to MkDocs
+
+[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file.
+
+#### Install MkDocs and Plugins
+
+Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies:
 
 ```bash
 pip install -r requirements/docs.txt
 ```
 
-Start the autoreloading MkDocs server:
+!!! note
+    Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+)
+
+#### Verify Installation
+
+Confirm that MkDocs is correctly installed:
+
+```bash
+mkdocs --version
+```
+
+Example output:
+
+```console
+mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10)
+```
+
+#### Clone the `vLLM` repository
+
+```bash
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+```
+
+#### Start the Development Server
+
+MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you're in the same directory as the `mkdocs.yml` configuration file, and then start the server by running the `mkdocs serve` command:
 
 ```bash
 mkdocs serve
 ```
 
+Example output:
+
+```console
+INFO    -  Documentation built in 106.83 seconds
+INFO    -  [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml'
+INFO    -  [22:02:02] Serving on http://127.0.0.1:8000/
+```
+
+#### View in Your Browser
+
+Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview:.
+
+#### Learn More
+
+For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/).
+
 ## Testing
 
 ```bash
@@ -60,6 +108,9 @@ pre-commit run mypy-3.9 --hook-stage manual --all-files
 
 # Unit tests
 pytest tests/
+
+# Run tests for a single test file with detailed output
+pytest -s -v tests/test_logger.py
 ```
 
 !!! tip
diff --git a/docs/contributing/ci-failures.md b/docs/contributing/ci-failures.md
new file mode 100644
index 0000000000000000000000000000000000000000..7caaf10ceb5c8f37505ae4c160231b30e9aabed1
--- /dev/null
+++ b/docs/contributing/ci-failures.md
@@ -0,0 +1,118 @@
+# CI Failures
+
+What should I do when a CI job fails on my PR, but I don't think my PR caused
+the failure?
+
+- Check the dashboard of current CI test failures:  
+  👉 [CI Failures Dashboard](https://github.com/orgs/vllm-project/projects/20)
+
+- If your failure **is already listed**, it's likely unrelated to your PR.  
+  Help fixing it is always welcome!  
+    - Leave comments with links to additional instances of the failure.  
+    - React with a 👍 to signal how many are affected.
+
+- If your failure **is not listed**, you should **file an issue**.
+
+## Filing a CI Test Failure Issue
+
+- **File a bug report:**  
+    👉 [New CI Failure Report](https://github.com/vllm-project/vllm/issues/new?template=450-ci-failure.yml)
+
+- **Use this title format:**
+  
+    ```
+    [CI Failure]: failing-test-job - regex/matching/failing:test
+    ```
+
+- **For the environment field:**
+  
+    ```
+ Still failing on main as of commit abcdef123
+    ```
+
+- **In the description, include failing tests:**
+  
+    ```
+    FAILED failing/test.py:failing_test1 - Failure description  
+     FAILED failing/test.py:failing_test2 - Failure description  
+    https://github.com/orgs/vllm-project/projects/20  
+    https://github.com/vllm-project/vllm/issues/new?template=400-bug-report.yml  
+    FAILED failing/test.py:failing_test3 - Failure description  
+    ```
+
+- **Attach logs** (collapsible section example):
+    <details>
+    <summary>Logs:</summary>
+
+    ```text
+    ERROR 05-20 03:26:38 [dump_input.py:68] Dumping input data  
+    --- Logging error ---  
+    Traceback (most recent call last):  
+      File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 203, in execute_model  
+        return self.model_executor.execute_model(scheduler_output)  
+    ...
+    FAILED failing/test.py:failing_test1 - Failure description  
+    FAILED failing/test.py:failing_test2 - Failure description  
+    FAILED failing/test.py:failing_test3 - Failure description  
+    ```
+  
+    </details>
+
+## Logs Wrangling
+
+Download the full log file from Buildkite locally.
+
+Strip timestamps and colorization:
+
+<gh-file:.buildkite/scripts/ci-clean-log.sh>
+
+```bash
+./ci-clean-log.sh ci.log
+```
+
+Use a tool [wl-clipboard](https://github.com/bugaevc/wl-clipboard) for quick copy-pasting:
+
+```bash
+tail -525 ci_build.log | wl-copy
+```
+
+## Investigating a CI Test Failure
+
+1. Go to 👉 [Buildkite main branch](https://buildkite.com/vllm/ci/builds?branch=main)  
+2. Bisect to find the first build that shows the issue.  
+3. Add your findings to the GitHub issue.  
+4. If you find a strong candidate PR, mention it in the issue and ping contributors.
+
+## Reproducing a Failure
+
+CI test failures may be flaky. Use a bash loop to run repeatedly:
+
+<gh-file:.buildkite/scripts/rerun-test.sh>
+
+```bash
+./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]
+```
+
+## Submitting a PR
+
+If you submit a PR to fix a CI failure:
+
+- Link the PR to the issue:  
+  Add `Closes #12345` to the PR description.
+- Add the `ci-failure` label:  
+  This helps track it in the [CI Failures GitHub Project](https://github.com/orgs/vllm-project/projects/20).
+
+## Other Resources
+
+- 🔍 [Test Reliability on `main`](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main&order=ASC&sort_by=reliability)
+- 🧪 [Latest Buildkite CI Runs](https://buildkite.com/vllm/ci/builds?branch=main)
+
+## Daily Triage
+
+Use [Buildkite analytics (2-day view)](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main&period=2days) to:
+
+- Identify recent test failures **on `main`**.
+- Exclude legitimate test failures on PRs.
+- (Optional) Ignore tests with 0% reliability.
+
+Compare to the [CI Failures Dashboard](https://github.com/orgs/vllm-project/projects/20).
diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md
index 516640f6fd3c494dccbc8957c36451b5df1c46c2..93d9e80f5b012586838979ed04e13c74a9d56b96 100644
--- a/docs/deployment/docker.md
+++ b/docs/deployment/docker.md
@@ -46,11 +46,11 @@ You can add any other [engine-args][engine-args] you need after the image tag (`
     create a custom Dockerfile on top of the base image with an extra layer that installs them:
 
     ```Dockerfile
-    FROM vllm/vllm-openai:v0.8.3
+    FROM vllm/vllm-openai:v0.9.0
 
     # e.g. install the `audio` optional dependencies
     # NOTE: Make sure the version of vLLM matches the base image!
-    RUN uv pip install --system vllm[audio]==0.8.3
+    RUN uv pip install --system vllm[audio]==0.9.0
     ```
 
 !!! tip
@@ -107,10 +107,21 @@ DOCKER_BUILDKIT=1 docker build . \
   -t vllm/vllm-gh200-openai:latest \
   --build-arg max_jobs=66 \
   --build-arg nvcc_threads=2 \
-  --build-arg torch_cuda_arch_list="9.0+PTX" \
+  --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
   --build-arg vllm_fa_cmake_gpu_arches="90-real"
 ```
 
+!!! note
+    If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.
+
+    Run the following command on your host machine to register QEMU user static handlers:
+
+    ```console
+    docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
+    ```
+
+    After setting up QEMU, you can use the `--platform "linux/arm64"` flag in your `docker build` command.
+
 ## Use the custom-built vLLM Docker image
 
 To run vLLM with the custom-built Docker image:
diff --git a/docs/deployment/nginx.md b/docs/deployment/nginx.md
index 80242919ba5b3aaa77fb152d62e77fefc8dfd803..f0ff5c1d0e76da6336dee2d3578689b0f8d05cc8 100644
--- a/docs/deployment/nginx.md
+++ b/docs/deployment/nginx.md
@@ -5,16 +5,6 @@ title: Using Nginx
 
 This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
 
-Table of contents:
-
-1. [Build Nginx Container][nginxloadbalancer-nginx-build]
-2. [Create Simple Nginx Config file][nginxloadbalancer-nginx-conf]
-3. [Build vLLM Container][nginxloadbalancer-nginx-vllm-container]
-4. [Create Docker Network][nginxloadbalancer-nginx-docker-network]
-5. [Launch vLLM Containers][nginxloadbalancer-nginx-launch-container]
-6. [Launch Nginx][nginxloadbalancer-nginx-launch-nginx]
-7. [Verify That vLLM Servers Are Ready][nginxloadbalancer-nginx-verify-nginx]
-
 [](){ #nginxloadbalancer-nginx-build }
 
 ## Build Nginx Container
diff --git a/docs/design/arch_overview.md b/docs/design/arch_overview.md
index 75d3e1b7ccc787e894446c2850b75582ea3c2baf..14720a392aafb7c49bbe188108b8688563b00b66 100644
--- a/docs/design/arch_overview.md
+++ b/docs/design/arch_overview.md
@@ -48,8 +48,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-More API details can be found in the [Offline Inference]
-(#offline-inference-api) section of the API docs.
+More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
 
 The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
 
diff --git a/docs/design/multiprocessing.md b/docs/design/multiprocessing.md
index 412c42fd580e5d33839105372bf5138b48abf722..4d58fae20f06ca329b3a127a47142d2138ca5a52 100644
--- a/docs/design/multiprocessing.md
+++ b/docs/design/multiprocessing.md
@@ -22,13 +22,13 @@ This document describes how vLLM deals with these challenges.
 
 [Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
 
-- `spawn` - spawn a new Python process. This will be the default as of Python
-  3.14. In macOS, this is already the default.
+- `spawn` - spawn a new Python process. The default on Windows and macOS.
 
-- `fork` - Use `os.fork()` to fork the Python interpreter. This is the default
-  in Python versions prior to 3.14.
+- `fork` - Use `os.fork()` to fork the Python interpreter. The default on
+  Linux for Python versions prior to 3.14.
 
 - `forkserver` - Spawn a server process that will fork a new process on request.
+  The default on Linux for Python version 3.14 and newer.
 
 ### Tradeoffs
 
diff --git a/docs/design/v1/prefix_caching.md b/docs/design/v1/prefix_caching.md
index ad041b0059f58a11ac86e136f9821510adace77d..e87e4c6a48b73a953cb40b9482dcb0202e911251 100644
--- a/docs/design/v1/prefix_caching.md
+++ b/docs/design/v1/prefix_caching.md
@@ -104,7 +104,7 @@ class KVCacheBlock:
     block_id: int
     # The block hash (will be assigned when the block is full,
     # and will be reset when the block is evicted).
-    block_hash: BlockHashType
+    block_hash: BlockHash
     # The number of requests using this block now.
     ref_cnt: int
 
@@ -144,7 +144,7 @@ As a result, we will have the following components when the KV cache manager is
 
 **Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation:
 
-1. The scheduler calls `kv_cache_manager.append_slots()`. It does the following steps:  
+1. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps:  
    1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate.  
    2. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.  
    3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block is full, we add it to the Cache Block to cache it.
diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md
index 77ceea49f173275383c5aa8e2a9abbe4bd0abe7c..5d448eb5c03d866fc1edae4866da8d25e496502c 100644
--- a/docs/features/compatibility_matrix.md
+++ b/docs/features/compatibility_matrix.md
@@ -10,6 +10,7 @@ The symbols used have the following meanings:
 - ✅ = Full compatibility
 - 🟠 = Partial compatibility
 - ❌ = No compatibility
+- ❔ = Unknown or TBD
 
 !!! note
     Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/hardware combination.
@@ -36,23 +37,23 @@ th:not(:first-child) {
 }
 </style>
 
-| Feature                                                   | [CP][chunked-prefill]   | [APC][automatic-prefix-caching]   | [LoRA][lora-adapter]   | <abbr title="Prompt Adapter">prmpt adptr</abbr>   | [SD][spec-decode]   | CUDA graph   | <abbr title="Pooling Models">pooling</abbr>   | <abbr title="Encoder-Decoder Models">enc-dec</abbr>   | <abbr title="Logprobs">logP</abbr>   | <abbr title="Prompt Logprobs">prmpt logP</abbr>   | <abbr title="Async Output Processing">async output</abbr>   | multi-step         | <abbr title="Multimodal Inputs">mm</abbr>   | best-of   | beam-search   |
-|-----------------------------------------------------------|-------------------------|-----------------------------------|------------------------|---------------------------------------------------|---------------------|--------------|-----------------------------------------------|-------------------------------------------------------|--------------------------------------|---------------------------------------------------|-------------------------------------------------------------|--------------------|---------------------------------------------|-----------|---------------|
-| [CP][chunked-prefill]                                     | ✅                       |                                   |                        |                                                   |                     |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| [APC][automatic-prefix-caching]                           | ✅                       | ✅                                 |                        |                                                   |                     |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| [LoRA][lora-adapter]                                      | ✅                       | ✅                                 | ✅                      |                                                   |                     |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| <abbr title="Prompt Adapter">prmpt adptr</abbr>           | ✅                       | ✅                                 | ✅                      | ✅                                                 |                     |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| [SD][spec-decode]                                         | ✅                       | ✅                                 | ❌                      | ✅                                                 | ✅                   |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| CUDA graph                                                | ✅                       | ✅                                 | ✅                      | ✅                                                 | ✅                   | ✅            |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| <abbr title="Pooling Models">pooling</abbr>               | ❌                       | ❌                                 | ❌                      | ❌                                                 | ❌                   | ❌            | ✅                                             |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
-| <abbr title="Encoder-Decoder Models">enc-dec</abbr>       | ❌                       | [❌](gh-issue:7366)                | ❌                      | ❌                                                 | [❌](gh-issue:7366)  | ✅            | ✅                                             | ✅                                                     |                                      |                                                   |                                                             |                    |                                             |           |               |
-| <abbr title="Logprobs">logP</abbr>                        | ✅                       | ✅                                 | ✅                      | ✅                                                 | ✅                   | ✅            | ❌                                             | ✅                                                     | ✅                                    |                                                   |                                                             |                    |                                             |           |               |
-| <abbr title="Prompt Logprobs">prmpt logP</abbr>           | ✅                       | ✅                                 | ✅                      | ✅                                                 | ✅                   | ✅            | ❌                                             | ✅                                                     | ✅                                    | ✅                                                 |                                                             |                    |                                             |           |               |
-| <abbr title="Async Output Processing">async output</abbr> | ✅                       | ✅                                 | ✅                      | ✅                                                 | ❌                   | ✅            | ❌                                             | ❌                                                     | ✅                                    | ✅                                                 | ✅                                                           |                    |                                             |           |               |
-| multi-step                                                | ❌                       | ✅                                 | ❌                      | ✅                                                 | ❌                   | ✅            | ❌                                             | ❌                                                     | ✅                                    | ✅                                                 | ✅                                                           | ✅                  |                                             |           |               |
-| <abbr title="Multimodal Inputs">mm</abbr>                 | ✅                       | [🟠](gh-pr:8348)                   | [🟠](gh-pr:4194)        | ❔                                                 | ❔                   | ✅            | ✅                                             | ✅                                                     | ✅                                    | ✅                                                 | ✅                                                           | ❔                  | ✅                                           |           |               |
-| best-of                                                   | ✅                       | ✅                                 | ✅                      | ✅                                                 | [❌](gh-issue:6137)  | ✅            | ❌                                             | ✅                                                     | ✅                                    | ✅                                                 | ❔                                                           | [❌](gh-issue:7968) | ✅                                           | ✅         |               |
-| beam-search                                               | ✅                       | ✅                                 | ✅                      | ✅                                                 | [❌](gh-issue:6137)  | ✅            | ❌                                             | ✅                                                     | ✅                                    | ✅                                                 | ❔                                                           | [❌](gh-issue:7968) | ❔                                           | ✅         | ✅             |
+| Feature | [CP][chunked-prefill] | [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | <abbr title="Prompt Adapter">prmpt adptr</abbr> | [SD][spec-decode] | CUDA graph | <abbr title="Pooling Models">pooling</abbr> | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
+|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
+| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
+| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | |
+| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | |
+| <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | |
+| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | |
+| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
+| <abbr title="Pooling Models">pooling</abbr> | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | |
+| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
+| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
+| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
+| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
+| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
+| <abbr title="Multimodal Inputs">mm</abbr> | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
+| best-of | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | |
+| beam-search | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ |
 
 [](){ #feature-x-hardware }
 
@@ -75,3 +76,6 @@ th:not(:first-child) {
 | multi-step                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | [❌](gh-issue:8477) | ✅     |
 | best-of                                                   | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     |
 | beam-search                                               | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     |
+
+!!! note
+    Please refer to [Feature support through NxD Inference backend][feature-support-through-nxd-inference-backend] for features supported on AWS Neuron hardware
diff --git a/docs/features/lora.md b/docs/features/lora.md
index 642462f7c45576c1f6f291fc7b1a5d4939dbd397..04e92dbc459247e2b8a6dabebee9166141d11841 100644
--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@@ -165,6 +165,7 @@ it will first look in the local directory for a directory `foobar`, and attempt
 that adapter will then be available for normal use on the server.
 
 Alternatively, follow these example steps to implement your own plugin:
+
 1. Implement the LoRAResolver interface.
 
     Example of a simple S3 LoRAResolver implementation:
@@ -198,9 +199,9 @@ Alternatively, follow these example steps to implement your own plugin:
             return lora_request
     ```
 
-2. Register LoRAResolver plugin.
+2. Register `LoRAResolver` plugin.
 
-     ```python
+    ```python
     from vllm.lora.resolver import LoRAResolverRegistry
 
     s3_resolver = S3LoRAResolver()
diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md
index 2967bf9c7504af7af7e9636f81d30e026e38a892..6a585b1ccb2caedd2027c4ce85667b9c2a541364 100644
--- a/docs/features/quantization/supported_hardware.md
+++ b/docs/features/quantization/supported_hardware.md
@@ -5,13 +5,13 @@ title: Supported Hardware
 
 The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
 
-| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | x86 CPU   | AWS Inferentia   | Google TPU   |
+| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | x86 CPU   | AWS Neuron   | Google TPU   |
 |-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------|------------------|--------------|
 | AWQ                   | ❌       | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ✅︎        | ❌                | ❌            |
 | GPTQ                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ✅︎        | ❌                | ❌            |
 | Marlin (GPTQ/AWQ/FP8) | ❌       | ❌        | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
-| INT8 (W8A8)           | ❌       | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ✅︎        | ❌                | ✅︎           |
-| FP8 (W8A8)            | ❌       | ❌        | ❌        | ✅︎    | ✅︎       | ✅︎        | ❌           | ❌         | ❌                | ❌            |
+| INT8 (W8A8)           | ❌       | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ✅︎        | ✅︎                | ✅︎           |
+| FP8 (W8A8)            | ❌       | ❌        | ❌        | ✅︎    | ✅︎       | ✅︎        | ❌           | ❌         | ✅︎                | ❌            |
 | BitBLAS (GPTQ)        | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
 | AQLM                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
 | bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index 6ee1060dd050af0a6f77eed0f231833a9e15f061..3547069f724dc8e20c657e010e0975b30ac5f6dc 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -238,9 +238,11 @@ Flags: `--tool-call-parser hermes`
 ### DeepSeek-V3 Models (`deepseek_v3`)
 
 Supported models:
-* `deepseek-ai/DeepSeek-V3-0324`
 
-Flags: `--tool-call-parser deepseek_v3 --chat-template examples/tool_chat_template_deepseekv3.jinja`
+* `deepseek-ai/DeepSeek-V3-0324` (use with <gh-file:examples/tool_chat_template_deepseekv3.jinja>)
+* `deepseek-ai/DeepSeek-R1-0528` (use with <gh-file:examples/tool_chat_template_deepseekr1.jinja>)
+
+Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`
 
 ### Models with Pythonic Tool Calls (`pythonic`)
 
diff --git a/docs/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/getting_started/installation/ai_accelerator/neuron.inc.md
index f08c78fba6c85b5337cd4df64a19ac6d7385b981..86c12472fb3609a4ae81b204d162866bb7bb25b7 100644
--- a/docs/getting_started/installation/ai_accelerator/neuron.inc.md
+++ b/docs/getting_started/installation/ai_accelerator/neuron.inc.md
@@ -1,8 +1,9 @@
 # --8<-- [start:installation]
 
-vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
-Paged Attention and Chunked Prefill are currently in development and will be available soon.
-Data types currently supported in Neuron SDK are FP16 and BF16.
+[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the software development kit (SDK) used to run deep learning and 
+    generative AI workloads on AWS Inferentia and AWS Trainium powered Amazon EC2 instances and UltraServers (Inf1, Inf2, Trn1, Trn2, 
+    and Trn2 UltraServer). Both Trainium and Inferentia are powered by fully-independent heterogeneous compute-units called NeuronCores. 
+    This tab describes how to set up your environment to run vLLM on Neuron.
 
 !!! warning
     There are no pre-built wheels or images for this device, so you must build vLLM from source.
@@ -11,58 +12,30 @@ Data types currently supported in Neuron SDK are FP16 and BF16.
 # --8<-- [start:requirements]
 
 - OS: Linux
-- Python: 3.9 -- 3.11
-- Accelerator: NeuronCore_v2 (in trn1/inf2 instances)
-- Pytorch 2.0.1/2.1.1
-- AWS Neuron SDK 2.16/2.17 (Verified on python 3.8)
+- Python: 3.9 or newer
+- Pytorch 2.5/2.6
+- Accelerator: NeuronCore-v2 (in trn1/inf2 chips) or NeuronCore-v3 (in trn2 chips)
+- AWS Neuron SDK 2.23
 
 ## Configure a new environment
 
-### Launch Trn1/Inf2 instances
+### Launch a Trn1/Trn2/Inf2 instance and verify Neuron dependencies
 
-Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html).
+The easiest way to launch a Trainium or Inferentia instance with pre-installed Neuron dependencies is to follow this 
+[quick start guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/multiframework/multi-framework-ubuntu22-neuron-dlami.html#setup-ubuntu22-multi-framework-dlami) using the Neuron Deep Learning AMI (Amazon machine image).
 
-- Please follow the instructions at [launch an Amazon EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance) to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type.
-- To get more information about instances sizes and pricing see: [Trn1 web page](https://aws.amazon.com/ec2/instance-types/trn1/), [Inf2 web page](https://aws.amazon.com/ec2/instance-types/inf2/)
-- Select Ubuntu Server 22.04 TLS AMI
-- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB.
 - After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
-
-### Install drivers and tools
-
-The installation of drivers and tools wouldn't be necessary, if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below:
-
+- Once inside your instance, activate the pre-installed virtual environment for inference by running
 ```console
-# Configure Linux for Neuron repository updates
-. /etc/os-release
-sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
-deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
-EOF
-wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB \
-    | sudo apt-key add -
-
-# Update OS packages
-sudo apt-get update -y
-
-# Install OS headers
-sudo apt-get install linux-headers-$(uname -r) -y
-
-# Install git
-sudo apt-get install git -y
-
-# install Neuron Driver
-sudo apt-get install aws-neuronx-dkms=2.* -y
-
-# Install Neuron Runtime
-sudo apt-get install aws-neuronx-collectives=2.* -y
-sudo apt-get install aws-neuronx-runtime-lib=2.* -y
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+```
 
-# Install Neuron Tools
-sudo apt-get install aws-neuronx-tools=2.* -y
+Refer to the [NxD Inference Setup Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-setup.html) 
+for alternative setup instructions including using Docker and manually installing dependencies.
 
-# Add PATH
-export PATH=/opt/aws/neuron/bin:$PATH
-```
+!!! note
+    NxD Inference is the default recommended backend to run inference on Neuron. If you are looking to use the legacy [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) 
+    library, refer to [Transformers NeuronX Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/setup/index.html).  
 
 # --8<-- [end:requirements]
 # --8<-- [start:set-up-using-python]
@@ -75,60 +48,37 @@ Currently, there are no pre-built Neuron wheels.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
-!!! note
-    The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
-
-Following instructions are applicable to Neuron SDK 2.16 and beyond.
-
-#### Install transformers-neuronx and its dependencies
+#### Install vLLM from source
 
-[transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) will be the backend to support inference on trn1/inf2 instances.
-Follow the steps below to install transformer-neuronx package and its dependencies.
+Install vllm as follows:
 
 ```console
-# Install Python venv
-sudo apt-get install -y python3.10-venv g++
-
-# Create Python venv
-python3.10 -m venv aws_neuron_venv_pytorch
-
-# Activate Python venv
-source aws_neuron_venv_pytorch/bin/activate
-
-# Install Jupyter notebook kernel
-pip install ipykernel
-python3.10 -m ipykernel install \
-    --user \
-    --name aws_neuron_venv_pytorch \
-    --display-name "Python (torch-neuronx)"
-pip install jupyter notebook
-pip install environment_kernels
-
-# Set pip repository pointing to the Neuron repository
-python -m pip config set \
-    global.extra-index-url \
-    https://pip.repos.neuron.amazonaws.com
-
-# Install wget, awscli
-python -m pip install wget
-python -m pip install awscli
-
-# Update Neuron Compiler and Framework
-python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -U -r requirements/neuron.txt
+VLLM_TARGET_DEVICE="neuron" pip install -e .
 ```
 
-#### Install vLLM from source
+AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at 
+    [https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2), which contains several features in addition to what's 
+    available on vLLM V0. Please utilize the AWS Fork for the following features:
+
+- Llama-3.2 multi-modal support
+- Multi-node distributed inference 
 
-Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows:
+Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html) 
+    for more details and usage examples.
+
+To install the AWS Neuron fork, run the following:
 
 ```console
-git clone https://github.com/vllm-project/vllm.git
-cd vllm
-pip install -U -r requirements/neuron.txt
-VLLM_TARGET_DEVICE="neuron" pip install .
+git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git
+cd upstreaming-to-vllm
+pip install -r requirements/neuron.txt
+VLLM_TARGET_DEVICE="neuron" pip install -e .
 ```
 
-If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed.
+Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardwares is not tested.
 
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:set-up-using-docker]
@@ -148,5 +98,57 @@ Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dock
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:extra-information]
 
-There is no extra information for this device.
+[](){ #feature-support-through-nxd-inference-backend }
+### Feature support through NxD Inference backend
+
+The current vLLM and Neuron integration relies on either the `neuronx-distributed-inference` (preferred) or `transformers-neuronx` backend 
+    to perform most of the heavy lifting which includes PyTorch model initialization, compilation, and runtime execution. Therefore, most 
+    [features supported on Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html) are also available via the vLLM integration. 
+
+To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override 
+as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include
+```console
+override_neuron_config={
+    "enable_bucketing":False,
+}
+```
+or when launching vLLM from the CLI, pass
+```console
+--override-neuron-config "{\"enable_bucketing\":false}"
+```
+
+Alternatively, users can directly call the NxDI library to trace and compile your model, then load the pre-compiled artifacts 
+(via `NEURON_COMPILED_ARTIFACTS` environment variable) in vLLM to run inference workloads. 
+
+### Known limitations
+
+- EAGLE speculative decoding: NxD Inference requires the EAGLE draft checkpoint to include the LM head weights from the target model. Refer to this
+    [guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html#eagle-checkpoint-compatibility)
+    for how to convert pretrained EAGLE model checkpoints to be compatible for NxDI.
+- Quantization: the native quantization flow in vLLM is not well supported on NxD Inference. It is recommended to follow this 
+    [Neuron quantization guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html) 
+    to quantize and compile your model using NxD Inference, and then load the compiled artifacts into vLLM.
+- Multi-LoRA serving: NxD Inference only supports loading of LoRA adapters at server startup. Dynamic loading of LoRA adapters at 
+    runtime is not currently supported. Refer to [multi-lora example](https://github.com/aws-neuron/upstreaming-to-vllm/blob/neuron-2.23-vllm-v0.7.2/examples/offline_inference/neuron_multi_lora.py)
+- Multi-modal support: multi-modal support is only available through the AWS Neuron fork. This feature has not been upstreamed
+    to vLLM main because NxD Inference currently relies on certain adaptations to the core vLLM logic to support this feature.
+- Multi-node support: distributed inference across multiple Trainium/Inferentia instances is only supported on the AWS Neuron fork. Refer
+    to this [multi-node example](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2/examples/neuron/multi_node)
+    to run. Note that tensor parallelism (distributed inference across NeuronCores) is available in vLLM main.
+- Known edge case bug in speculative decoding: An edge case failure may occur in speculative decoding when sequence length approaches 
+    max model length (e.g. when requesting max tokens up to the max model length and ignoring eos). In this scenario, vLLM may attempt 
+    to allocate an additional block to ensure there is enough memory for number of lookahead slots, but since we do not have good support 
+    for paged attention, there isn't another Neuron block for vLLM to allocate. A workaround fix (to terminate 1 iteration early) is 
+    implemented in the AWS Neuron fork but is not upstreamed to vLLM main as it modifies core vLLM logic.
+
+
+### Environment variables
+- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid 
+    compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the
+    artifacts under `neuron-compiled-artifacts/{unique_hash}/` sub-directory in the model path. If this environment variable is set,
+    but the directory does not exist, or the contents are invalid, Neuron will also fallback to a new compilation and store the artifacts
+    under this specified path.
+- `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend).
+- `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend).
+
 # --8<-- [end:extra-information]
diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index 18c96b264ad82fdb2c17ece299cf253ecfaf6147..00bb5cae43f008eb94fb4dd77a77b802d20bff5f 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -110,8 +110,9 @@ vLLM CPU backend supports the following vLLM features:
 
 ## Related runtime environment variables
 
-- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
-- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
+- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`.
+- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node. By setting to `all`, the OpenMP threads of each rank uses all CPU cores available on the system. Default value is `auto`.
+- `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `0`.
 - `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
 
 ## Performance tips
@@ -133,7 +134,15 @@ export VLLM_CPU_OMP_THREADS_BIND=0-29
 vllm serve facebook/opt-125m
 ```
 
-- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
+ or using default auto thread binding:
+
+```console
+export VLLM_CPU_KVCACHE_SPACE=40
+export VLLM_CPU_NUM_OF_RESERVED_CPU=2
+vllm serve facebook/opt-125m
+```
+
+- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND` or using auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
 
 ```console
 $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
@@ -178,6 +187,12 @@ $ python examples/offline_inference/basic/basic.py
     VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
     ```
 
+    or using default auto thread binding:
+
+    ```console
+    VLLM_CPU_KVCACHE_SPACE=40 vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
+    ```
+
   - For each thread id list in `VLLM_CPU_OMP_THREADS_BIND`, users should guarantee threads in the list belong to a same NUMA node.
 
   - Meanwhile, users should also take care of memory capacity of each NUMA node. The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`, if it exceeds the capacity of a single NUMA node, TP worker will be killed due to out-of-memory.
diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md
index 7d6472afa7ea71bc6e706946a2daf1087c5a57b6..7ddadccb1b4f1de09d04a9948618c3344019ccdd 100644
--- a/docs/getting_started/installation/cpu/build.inc.md
+++ b/docs/getting_started/installation/cpu/build.inc.md
@@ -17,7 +17,7 @@ Third, install Python packages for vLLM CPU backend building:
 
 ```console
 pip install --upgrade pip
-pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy
+pip install "cmake>=3.26.1" wheel packaging ninja "setuptools-scm>=8" numpy
 pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
 ```
 
diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md
index 3c983f600673d8d75af9ee9d8884568df99283a8..f8a3acef784fccefc99073208bd032b3b19739bc 100644
--- a/docs/getting_started/installation/gpu.md
+++ b/docs/getting_started/installation/gpu.md
@@ -19,6 +19,9 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 - OS: Linux
 - Python: 3.9 -- 3.12
 
+!!! note
+    vLLM does not support Windows natively. To run vLLM on Windows, you can use the Windows Subsystem for Linux (WSL) with a compatible Linux distribution, or use some community-maintained forks, e.g. [https://github.com/SystemPanic/vllm-windows](https://github.com/SystemPanic/vllm-windows).
+
 === "NVIDIA CUDA"
 
     --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:requirements"
diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md
index 0029b3a244968adeefc8b7c43547bc5c0b96b4e4..8b7dc6dd09d344284ebf1d9d72263a6da56f181e 100644
--- a/docs/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/getting_started/installation/gpu/rocm.inc.md
@@ -179,8 +179,6 @@ It is important that the user kicks off the docker build using buildkit. Either
 It provides flexibility to customize the build of docker image using the following arguments:
 
 - `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:docker/Dockerfile.rocm_base>
-- `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build
-- `BUILD_RPD`: Include RocmProfileData profiling tool in the image
 - `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image
 
 Their values can be passed in when running `docker build` with `--build-arg` options.
diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py
index 6f290efe45c2f112ad03d2924e22c9ba2a120c15..7cfc89605150e4dce4b1653bb4dc20f2eff0d976 100644
--- a/docs/mkdocs/hooks/generate_examples.py
+++ b/docs/mkdocs/hooks/generate_examples.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
 from dataclasses import dataclass, field
 from pathlib import Path
diff --git a/docs/mkdocs/hooks/remove_announcement.py b/docs/mkdocs/hooks/remove_announcement.py
index e5f8549d838370576846e3948738c14ac381c453..1a84039abc14f3c4db6245a62fd2fb06a49cf971 100644
--- a/docs/mkdocs/hooks/remove_announcement.py
+++ b/docs/mkdocs/hooks/remove_announcement.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
+from pathlib import Path
 from typing import Literal
 
 
@@ -7,10 +9,9 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
     # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
     if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag":
         # remove the warning banner if the version is a tagged release
-        docs_dir = os.path.dirname(__file__)
-        announcement_path = os.path.join(docs_dir,
-                                         "mkdocs/overrides/main.html")
+        mkdocs_dir = Path(__file__).parent.parent
+        announcement_path = mkdocs_dir / "overrides/main.html"
         # The file might be removed already if the build is triggered multiple
         # times (readthedocs build both HTML and PDF versions separately)
-        if os.path.exists(announcement_path):
+        if announcement_path.exists():
             os.remove(announcement_path)
diff --git a/docs/mkdocs/hooks/url_schemes.py b/docs/mkdocs/hooks/url_schemes.py
index c738828085ba75046b5f5909a8f4edf3b8d45fa2..6484581ed94782f1acccf32fdbf1a7abe3e1397f 100644
--- a/docs/mkdocs/hooks/url_schemes.py
+++ b/docs/mkdocs/hooks/url_schemes.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import regex as re
 from mkdocs.config.defaults import MkDocsConfig
 from mkdocs.structure.files import Files
diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md
index b6feb405c6cac8c38158699869afcdefba727f1e..e0b4479c0beb67bc0e7cc311f454b6b686dac850 100644
--- a/docs/models/extensions/tensorizer.md
+++ b/docs/models/extensions/tensorizer.md
@@ -10,7 +10,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
 
 For more information on CoreWeave's Tensorizer, please refer to
 [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
-the [vLLM example script](https://docs.vllm.ai/en/latest/examples/tensorize_vllm_model.html).
+the [vLLM example script](https://docs.vllm.ai/en/latest/examples/others/tensorize_vllm_model.html).
 
 !!! note
     Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 7594c6e6fbf1ea7b406d35b29f3aa4e8afa0293f..a8a6f3417e546316b63eab2227bef0a5ffa1fb23 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -302,31 +302,31 @@ Specified using `--task generate`.
 | Architecture                                      | Models                                              | Example HF Models                                                                                                                                                            | [LoRA][lora-adapter]   | [PP][distributed-serving]   |
 |---------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|
 | `AquilaForCausalLM`                               | Aquila, Aquila2                                     | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.                                                                                                                                 | ✅︎                     | ✅︎                          |
-| `ArcticForCausalLM`                               | Arctic                                              | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc.                                                                                               | ✅︎                     |                             |
+| `ArcticForCausalLM`                               | Arctic                                              | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc.                                                                                               |                        | ✅︎                          |
 | `BaiChuanForCausalLM`                             | Baichuan2, Baichuan                                 | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.                                                                                                          | ✅︎                     | ✅︎                          |
-| `BambaForCausalLM`                                | Bamba                                               | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B`                                                                                                                   |                        |                             |
-| `BloomForCausalLM`                                | BLOOM, BLOOMZ, BLOOMChat                            | `bigscience/bloom`, `bigscience/bloomz`, etc.                                                                                                                                | ✅︎                     |                             |
+| `BambaForCausalLM`                                | Bamba                                               | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B`                                                                                                                   | ✅︎                     | ✅︎                          |
+| `BloomForCausalLM`                                | BLOOM, BLOOMZ, BLOOMChat                            | `bigscience/bloom`, `bigscience/bloomz`, etc.                                                                                                                                |                        | ✅︎                          |
 | `BartForConditionalGeneration`                    | BART                                                | `facebook/bart-base`, `facebook/bart-large-cnn`, etc.                                                                                                                        |                        |                             |
 | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM                                             | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc.                                                                                                       | ✅︎                     | ✅︎                          |
 | `CohereForCausalLM`, `Cohere2ForCausalLM`         | Command-R                                           | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc.                                                                                               | ✅︎                     | ✅︎                          |
-| `DbrxForCausalLM`                                 | DBRX                                                | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.                                                                                                                     | ✅︎                     |                             |
-| `DeciLMForCausalLM`                               | DeciLM                                              | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc.                                                                                                                               | ✅︎                     |                             |
-| `DeepseekForCausalLM`                             | DeepSeek                                            | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc.                                                                                                 | ✅︎                     |                             |
-| `DeepseekV2ForCausalLM`                           | DeepSeek-V2                                         | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc.                                                                                                               | ✅︎                     |                             |
-| `DeepseekV3ForCausalLM`                           | DeepSeek-V3                                         | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc.                                                                                                               | ✅︎                     |                             |
+| `DbrxForCausalLM`                                 | DBRX                                                | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.                                                                                                                     |                        | ✅︎                          |
+| `DeciLMForCausalLM`                               | DeciLM                                              | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc.                                                                                                                               | ✅︎                     | ✅︎                          |
+| `DeepseekForCausalLM`                             | DeepSeek                                            | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc.                                                                                                 |                        | ✅︎                          |
+| `DeepseekV2ForCausalLM`                           | DeepSeek-V2                                         | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc.                                                                                                               |                        | ✅︎                          |
+| `DeepseekV3ForCausalLM`                           | DeepSeek-V3                                         | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc.                                                                                                               |                        | ✅︎                          |
 | `ExaoneForCausalLM`                               | EXAONE-3                                            | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc.                                                                                                                                 | ✅︎                     | ✅︎                          |
-| `FalconForCausalLM`                               | Falcon                                              | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.                                                                                                         | ✅︎                     |                             |
-| `FalconMambaForCausalLM`                          | FalconMamba                                         | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc.                                                                                                            | ✅︎                     | ✅︎                          |
+| `FalconForCausalLM`                               | Falcon                                              | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.                                                                                                         |                        | ✅︎                          |
+| `FalconMambaForCausalLM`                          | FalconMamba                                         | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc.                                                                                                            |                        | ✅︎                          |
 | `FalconH1ForCausalLM`                             | Falcon-H1                                           | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc.                                                                                                           | ✅︎                     | ✅︎                          |
 | `GemmaForCausalLM`                                | Gemma                                               | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc.                                                                                                                            | ✅︎                     | ✅︎                          |
 | `Gemma2ForCausalLM`                               | Gemma 2                                             | `google/gemma-2-9b`, `google/gemma-2-27b`, etc.                                                                                                                              | ✅︎                     | ✅︎                          |
 | `Gemma3ForCausalLM`                               | Gemma 3                                             | `google/gemma-3-1b-it`, etc.                                                                                                                                                 | ✅︎                     | ✅︎                          |
 | `GlmForCausalLM`                                  | GLM-4                                               | `THUDM/glm-4-9b-chat-hf`, etc.                                                                                                                                               | ✅︎                     | ✅︎                          |
 | `Glm4ForCausalLM`                                 | GLM-4-0414                                          | `THUDM/GLM-4-32B-0414`, etc.                                                                                                                                                 | ✅︎                     | ✅︎                          |
-| `GPT2LMHeadModel`                                 | GPT-2                                               | `gpt2`, `gpt2-xl`, etc.                                                                                                                                                      | ✅︎                     |                             |
+| `GPT2LMHeadModel`                                 | GPT-2                                               | `gpt2`, `gpt2-xl`, etc.                                                                                                                                                      |                        | ✅︎                          |
 | `GPTBigCodeForCausalLM`                           | StarCoder, SantaCoder, WizardCoder                  | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc.                                                                                 | ✅︎                     | ✅︎                          |
-| `GPTJForCausalLM`                                 | GPT-J                                               | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.                                                                                                                            | ✅︎                     |                             |
-| `GPTNeoXForCausalLM`                              | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | ✅︎                     |                             |
+| `GPTJForCausalLM`                                 | GPT-J                                               | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.                                                                                                                            |                        | ✅︎                          |
+| `GPTNeoXForCausalLM`                              | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. |                        | ✅︎                          |
 | `GraniteForCausalLM`                              | Granite 3.0, Granite 3.1, PowerLM                   | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc.                                                                             | ✅︎                     | ✅︎                          |
 | `GraniteMoeForCausalLM`                           | Granite 3.0 MoE, PowerMoE                           | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc.                                                                | ✅︎                     | ✅︎                          |
 | `GraniteMoeHybridForCausalLM`                     | Granite 4.0 MoE Hybrid                              | `ibm-granite/granite-4.0-tiny-preview`, etc.                                                                                                                                 | ✅︎                     | ✅︎                          |
@@ -336,39 +336,40 @@ Specified using `--task generate`.
 | `InternLMForCausalLM`                             | InternLM                                            | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.                                                                                                                    | ✅︎                     | ✅︎                          |
 | `InternLM2ForCausalLM`                            | InternLM2                                           | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.                                                                                                                  | ✅︎                     | ✅︎                          |
 | `InternLM3ForCausalLM`                            | InternLM3                                           | `internlm/internlm3-8b-instruct`, etc.                                                                                                                                       | ✅︎                     | ✅︎                          |
-| `JAISLMHeadModel`                                 | Jais                                                | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc.                                                         | ✅︎                     |                             |
+| `JAISLMHeadModel`                                 | Jais                                                | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc.                                                         |                        | ✅︎                          |
 | `JambaForCausalLM`                                | Jamba                                               | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc.                                                                                 | ✅︎                     | ✅︎                          |
 | `LlamaForCausalLM`                                | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi              | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc.        | ✅︎                     | ✅︎                          |
-| `MambaForCausalLM`                                | Mamba                                               | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc.                                                                               | ✅︎                     |                             |
+| `MambaForCausalLM`                                | Mamba                                               | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc.                                                                               |                        | ✅︎                          |
 | `MiniCPMForCausalLM`                              | MiniCPM                                             | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc.                                                                               | ✅︎                     | ✅︎                          |
 | `MiniCPM3ForCausalLM`                             | MiniCPM3                                            | `openbmb/MiniCPM3-4B`, etc.                                                                                                                                                  | ✅︎                     | ✅︎                          |
 | `MistralForCausalLM`                              | Mistral, Mistral-Instruct                           | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.                                                                                                      | ✅︎                     | ✅︎                          |
 | `MixtralForCausalLM`                              | Mixtral-8x7B, Mixtral-8x7B-Instruct                 | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.                                                          | ✅︎                     | ✅︎                          |
-| `MPTForCausalLM`                                  | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter        | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc.                                                                                                   | ✅︎                     |                             |
+| `MPTForCausalLM`                                  | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter        | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc.                                                                                                   |                        | ✅︎                          |
 | `NemotronForCausalLM`                             | Nemotron-3, Nemotron-4, Minitron                    | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc.                                                                                                         | ✅︎                     | ✅︎                          |
-| `OLMoForCausalLM`                                 | OLMo                                                | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.                                                                                                                             | ✅︎                     |                             |
-| `OLMo2ForCausalLM`                                | OLMo2                                               | `allenai/OLMo-2-0425-1B`, etc.                                                                                                                                               | ✅︎                     |                             |
-| `OLMoEForCausalLM`                                | OLMoE                                               | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc.                                                                                                        | ✅︎                     | ✅︎                          |
-| `OPTForCausalLM`                                  | OPT, OPT-IML                                        | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.                                                                                                                         | ✅︎                     |                             |
-| `OrionForCausalLM`                                | Orion                                               | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.                                                                                                             | ✅︎                     |                             |
+| `NemotronHForCausalLM`                            | Nemotron-H                                          | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc.                                                                       | ✅︎                     | ✅︎                          |
+| `OLMoForCausalLM`                                 | OLMo                                                | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.                                                                                                                             |                        | ✅︎                          |
+| `OLMo2ForCausalLM`                                | OLMo2                                               | `allenai/OLMo-2-0425-1B`, etc.                                                                                                                                               |                        | ✅︎                          |
+| `OLMoEForCausalLM`                                | OLMoE                                               | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc.                                                                                                        |                        | ✅︎                          |
+| `OPTForCausalLM`                                  | OPT, OPT-IML                                        | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.                                                                                                                         |                        | ✅︎                          |
+| `OrionForCausalLM`                                | Orion                                               | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.                                                                                                             |                        | ✅︎                          |
 | `PhiForCausalLM`                                  | Phi                                                 | `microsoft/phi-1_5`, `microsoft/phi-2`, etc.                                                                                                                                 | ✅︎                     | ✅︎                          |
 | `Phi3ForCausalLM`                                 | Phi-4, Phi-3                                        | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.   | ✅︎                     | ✅︎                          |
-| `Phi3SmallForCausalLM`                            | Phi-3-Small                                         | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc.                                                                                             | ✅︎                     |                             |
+| `Phi3SmallForCausalLM`                            | Phi-3-Small                                         | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc.                                                                                             |                        | ✅︎                          |
 | `PhiMoEForCausalLM`                               | Phi-3.5-MoE                                         | `microsoft/Phi-3.5-MoE-instruct`, etc.                                                                                                                                       | ✅︎                     | ✅︎                          |
-| `PersimmonForCausalLM`                            | Persimmon                                           | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc.                                                                                                                   | ✅︎                     |                             |
+| `PersimmonForCausalLM`                            | Persimmon                                           | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc.                                                                                                                   |                        | ✅︎                          |
 | `Plamo2ForCausalLM`                               | PLaMo2                                              | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc.                                                                                                                                 |                        |                             |
 | `QWenLMHeadModel`                                 | Qwen                                                | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.                                                                                                                                    | ✅︎                     | ✅︎                          |
 | `Qwen2ForCausalLM`                                | QwQ, Qwen2                                          | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc.                                                                                                      | ✅︎                     | ✅︎                          |
-| `Qwen2MoeForCausalLM`                             | Qwen2MoE                                            | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.                                                                                                                | ✅︎                     |                             |
+| `Qwen2MoeForCausalLM`                             | Qwen2MoE                                            | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.                                                                                                                |                        | ✅︎                          |
 | `Qwen3ForCausalLM`                                | Qwen3                                               | `Qwen/Qwen3-8B`, etc.                                                                                                                                                        | ✅︎                     | ✅︎                          |
-| `Qwen3MoeForCausalLM`                             | Qwen3MoE                                            | `Qwen/Qwen3-30B-A3B`, etc.                                                                                                                                                   | ✅︎                     |                             |
-| `StableLmForCausalLM`                             | StableLM                                            | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.                                                                                                | ✅︎                     |                             |
-| `Starcoder2ForCausalLM`                           | Starcoder2                                          | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.                                                                                             | ✅︎                     |                             |
+| `Qwen3MoeForCausalLM`                             | Qwen3MoE                                            | `Qwen/Qwen3-30B-A3B`, etc.                                                                                                                                                   |                        | ✅︎                          |
+| `StableLmForCausalLM`                             | StableLM                                            | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.                                                                                                |                        |                             |
+| `Starcoder2ForCausalLM`                           | Starcoder2                                          | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.                                                                                             |                        | ✅︎                          |
 | `SolarForCausalLM`                                | Solar Pro                                           | `upstage/solar-pro-preview-instruct`, etc.                                                                                                                                   | ✅︎                     | ✅︎                          |
 | `TeleChat2ForCausalLM`                            | TeleChat2                                           | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc.                                                                                                | ✅︎                     | ✅︎                          |
 | `TeleFLMForCausalLM`                              | TeleFLM                                             | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc.                                                                                                                    | ✅︎                     | ✅︎                          |
 | `XverseForCausalLM`                               | XVERSE                                              | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.                                                                                            | ✅︎                     | ✅︎                          |
-| `MiniMaxText01ForCausalLM`                        | MiniMax-Text                                        | `MiniMaxAI/MiniMax-Text-01`, etc.                                                                                                                                            | ✅︎                     |                             |
+| `MiniMaxText01ForCausalLM`                        | MiniMax-Text                                        | `MiniMaxAI/MiniMax-Text-01`, etc.                                                                                                                                            |                        |                             |
 | `Zamba2ForCausalLM`                               | Zamba2                                              | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.                                                                              |                        |                             |
 
 !!! note
@@ -401,7 +402,7 @@ Specified using `--task embed`.
 
 !!! note
     `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
-    You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
+    You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
 
 !!! note
     For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
@@ -512,44 +513,45 @@ Specified using `--task generate`.
 
 | Architecture                                 | Models                                                                   | Inputs                                                                | Example HF Models                                                                                                                                       | [LoRA][lora-adapter]   | [PP][distributed-serving]   | [V1](gh-issue:8779)   |
 |----------------------------------------------|--------------------------------------------------------------------------|-----------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------|
-| `AriaForConditionalGeneration`               | Aria                                                                     | T + I<sup>+</sup>                                                     | `rhymes-ai/Aria`                                                                                                                                        | ✅︎                     | ✅︎                          |                       |
-| `AyaVisionForConditionalGeneration`          | Aya Vision                                                               | T + I<sup>+</sup>                                                     | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc.                                                                                         | ✅︎                     | ✅︎                          |                       |
-| `Blip2ForConditionalGeneration`              | BLIP-2                                                                   | T + I<sup>E</sup>                                                     | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.                                                                                          | ✅︎                     | ✅︎                          |                       |
-| `ChameleonForConditionalGeneration`          | Chameleon                                                                | T + I                                                                 | `facebook/chameleon-7b` etc.                                                                                                                            | ✅︎                     | ✅︎                          |                       |
-| `DeepseekVLV2ForCausalLM`<sup>^</sup>        | DeepSeek-VL2                                                             | T + I<sup>+</sup>                                                     | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc.                                                      | ✅︎                     | ✅︎                          |                       |
+| `AriaForConditionalGeneration`               | Aria                                                                     | T + I<sup>+</sup>                                                     | `rhymes-ai/Aria`                                                                                                                                        |                        |                            | ✅︎                       |
+| `AyaVisionForConditionalGeneration`          | Aya Vision                                                               | T + I<sup>+</sup>                                                     | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc.                                                                                         |                        | ✅︎                          | ✅︎                       |
+| `Blip2ForConditionalGeneration`              | BLIP-2                                                                   | T + I<sup>E</sup>                                                     | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.                                                                                          |                        | ✅︎                          | ✅︎                      |
+| `ChameleonForConditionalGeneration`          | Chameleon                                                                | T + I                                                                 | `facebook/chameleon-7b` etc.                                                                                                                            |                      | ✅︎                            | ✅︎                      |
+| `DeepseekVLV2ForCausalLM`<sup>^</sup>        | DeepSeek-VL2                                                             | T + I<sup>+</sup>                                                     | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc.                                                      |                       | ✅︎                           | ✅︎                      |
 | `Florence2ForConditionalGeneration`          | Florence-2                                                               | T + I                                                                 | `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc.                                                                                          |                        |                             |                       |
-| `FuyuForCausalLM`                            | Fuyu                                                                     | T + I                                                                 | `adept/fuyu-8b` etc.                                                                                                                                    | ✅︎                     | ✅︎                          |                       |
+| `FuyuForCausalLM`                            | Fuyu                                                                     | T + I                                                                 | `adept/fuyu-8b` etc.                                                                                                                                    |                       | ✅︎                           | ✅︎                     |
 | `Gemma3ForConditionalGeneration`             | Gemma 3                                                                  | T + I<sup>+</sup>                                                     | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.                                                                                                   | ✅︎                     | ✅︎                          | ⚠️                    |
 | `GLM4VForCausalLM`<sup>^</sup>               | GLM-4V                                                                   | T + I                                                                 | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc.                                                                                                    | ✅︎                     | ✅︎                          | ✅︎                    |
 | `GraniteSpeechForConditionalGeneration`      | Granite Speech                                                           | T + A                                                                 | `ibm-granite/granite-speech-3.3-8b`                                                                                                                     | ✅︎                     | ✅︎                          | ✅︎                    |
-| `H2OVLChatModel`                             | H2OVL                                                                    | T + I<sup>E+</sup>                                                    | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.                                                                                      | ✅︎                     | ✅︎\*                        |                       |
-| `Idefics3ForConditionalGeneration`           | Idefics3                                                                 | T + I                                                                 | `HuggingFaceM4/Idefics3-8B-Llama3` etc.                                                                                                                 | ✅︎                     | ✅︎                          |                       |
-| `InternVLChatModel`                          | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>)                                                    | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.  | ✅︎                     | ✅︎                          |                       |
-| `KimiVLForConditionalGeneration`             | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking                               | T + I<sup>+</sup>                                                     | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking`                                                                                    | ✅︎                     |                             |                       |
-| `Llama4ForConditionalGeneration`             | Llama 4                                                                  | T + I<sup>+</sup>                                                     | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎                     | ✅︎                          |                       |
-| `LlavaForConditionalGeneration`              | LLaVA-1.5                                                                | T + I<sup>E+</sup>                                                    | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.                                                                        | ✅︎                     | ✅︎                          |                       |
-| `LlavaNextForConditionalGeneration`          | LLaVA-NeXT                                                               | T + I<sup>E+</sup>                                                    | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.                                                                           | ✅︎                     | ✅︎                          |                       |
-| `LlavaNextVideoForConditionalGeneration`     | LLaVA-NeXT-Video                                                         | T + V                                                                 | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.                                                                                                                 | ✅︎                     | ✅︎                          |                       |
-| `LlavaOnevisionForConditionalGeneration`     | LLaVA-Onevision                                                          | T + I<sup>+</sup> + V<sup>+</sup>                                     | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.                                                            | ✅︎                     | ✅︎                          |                       |
+| `H2OVLChatModel`                             | H2OVL                                                                    | T + I<sup>E+</sup>                                                    | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.                                                                                      |                       | ✅︎                          | ✅︎\*                     |
+| `Idefics3ForConditionalGeneration`           | Idefics3                                                                 | T + I                                                                 | `HuggingFaceM4/Idefics3-8B-Llama3` etc.                                                                                                                 | ✅︎                     |                           |  ✅︎                     |
+| `InternVLChatModel`                          | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>)                                                    | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.  | ✅︎               | ✅︎                          | ✅︎                      |
+| `KimiVLForConditionalGeneration`             | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking                               | T + I<sup>+</sup>                                                     | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking`                                                                                    |                      |                             | ✅︎                       |
+| `Llama4ForConditionalGeneration`             | Llama 4                                                                  | T + I<sup>+</sup>                                                     | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. |                       | ✅︎                          | ✅︎                      |
+| `LlavaForConditionalGeneration`              | LLaVA-1.5                                                                | T + I<sup>E+</sup>                                                    | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.                                                                        |                       | ✅︎                          | ✅︎                      |
+| `LlavaNextForConditionalGeneration`          | LLaVA-NeXT                                                               | T + I<sup>E+</sup>                                                    | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.                                                                           |                       | ✅︎                          | ✅︎                      |
+| `LlavaNextVideoForConditionalGeneration`     | LLaVA-NeXT-Video                                                         | T + V                                                                 | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.                                                                                                                 |                       | ✅︎                          | ✅︎                      |
+| `LlavaOnevisionForConditionalGeneration`     | LLaVA-Onevision                                                          | T + I<sup>+</sup> + V<sup>+</sup>                                     | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.                                                            |                       | ✅︎                          | ✅︎                      |
 | `MiniCPMO`                                   | MiniCPM-O                                                                | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup>                  | `openbmb/MiniCPM-o-2_6`, etc.                                                                                                                           | ✅︎                     | ✅︎                          | ✅︎                    |
-| `MiniCPMV`                                   | MiniCPM-V                                                                | T + I<sup>E+</sup> + V<sup>E+</sup>                                   | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc.                                                         | ✅︎                     | ✅︎                          | ✅︎                    |
-| `MiniMaxVL01ForConditionalGeneration`        | MiniMax-VL                                                               | T + I<sup>E+</sup>                                                    | `MiniMaxAI/MiniMax-VL-01`, etc.                                                                                                                         | ✅︎                     | ✅︎                          |                       |
+| `MiniCPMV`                                   | MiniCPM-V                                                                | T + I<sup>E+</sup> + V<sup>E+</sup>                                   | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc.                                                         | ✅︎                     |                            | ✅︎                    |
+| `MiniMaxVL01ForConditionalGeneration`        | MiniMax-VL                                                               | T + I<sup>E+</sup>                                                    | `MiniMaxAI/MiniMax-VL-01`, etc.                                                                                                                         |                       | ✅︎                          |                       |
 | `Mistral3ForConditionalGeneration`           | Mistral3                                                                 | T + I<sup>+</sup>                                                     | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc.                                                                                                   | ✅︎                     | ✅︎                          | ✅︎                    |
-| `MllamaForConditionalGeneration`             | Llama 3.2                                                                | T + I<sup>+</sup>                                                     | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc.                                                                     |                        |                             |                       |
+| `MllamaForConditionalGeneration`             | Llama 3.2                                                                | T + I<sup>+</sup>                                                     | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc.                                                                     |                        |                             |                        |
 | `MolmoForCausalLM`                           | Molmo                                                                    | T + I<sup>+</sup>                                                     | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc.                                                                                              | ✅︎                     | ✅︎                          | ✅︎                    |
-| `NVLM_D_Model`                               | NVLM-D 1.0                                                               | T + I<sup>+</sup>                                                     | `nvidia/NVLM-D-72B`, etc.                                                                                                                               | ✅︎                     | ✅︎                          |                       |
-| `Ovis`                                       | Ovis2, Ovis1.6                                                           | T + I<sup>+</sup>                                                     | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc.                                                                                                 | ✅︎                     |                             |                       |
-| `PaliGemmaForConditionalGeneration`          | PaliGemma, PaliGemma 2                                                   | T + I<sup>E</sup>                                                     | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc.                                                  | ✅︎                     | ⚠️                          |                       |
-| `Phi3VForCausalLM`                           | Phi-3-Vision, Phi-3.5-Vision                                             | T + I<sup>E+</sup>                                                    | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc.                                                                       | ✅︎                     | ✅︎                          |                       |
-| `Phi4MMForCausalLM`                          | Phi-4-multimodal                                                         | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc.                                                                                                             | ✅︎                     | ✅︎                          |                       |
-| `PixtralForConditionalGeneration`            | Pixtral                                                                  | T + I<sup>+</sup>                                                     | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc.                                                                  | ✅︎                     | ✅︎                          |                       |
+| `NVLM_D_Model`                               | NVLM-D 1.0                                                               | T + I<sup>+</sup>                                                     | `nvidia/NVLM-D-72B`, etc.                                                                                                                               |                       | ✅︎                          | ✅︎                      |
+| `Ovis`                                       | Ovis2, Ovis1.6                                                           | T + I<sup>+</sup>                                                     | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc.                                                                                                 |                       | ✅︎                          | ✅︎                      |
+| `PaliGemmaForConditionalGeneration`          | PaliGemma, PaliGemma 2                                                   | T + I<sup>E</sup>                                                     | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc.                                                  |                       | ✅︎                          | ⚠️                       |
+| `Phi3VForCausalLM`                           | Phi-3-Vision, Phi-3.5-Vision                                             | T + I<sup>E+</sup>                                                    | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc.                                                                       |                      | ✅︎                          |  ✅︎                     |
+| `Phi4MMForCausalLM`                          | Phi-4-multimodal                                                         | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc.                                                                                                             | ✅︎                     | ✅︎                          | ✅︎                       |
+| `PixtralForConditionalGeneration`            | Pixtral                                                                  | T + I<sup>+</sup>                                                     | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc.                                                                  |                       | ✅︎                          | ✅︎                      |
 | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL                                                                  | T + I<sup>E+</sup>                                                    | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.                                                                                                               | ✅︎                     | ✅︎                          | ✅︎                    |
-| `Qwen2AudioForConditionalGeneration`         | Qwen2-Audio                                                              | T + A<sup>+</sup>                                                     | `Qwen/Qwen2-Audio-7B-Instruct`                                                                                                                          | ✅︎                     | ✅︎                          |                       |
+| `Qwen2AudioForConditionalGeneration`         | Qwen2-Audio                                                              | T + A<sup>+</sup>                                                     | `Qwen/Qwen2-Audio-7B-Instruct`                                                                                                                          |                       | ✅︎                          | ✅︎                      |
 | `Qwen2VLForConditionalGeneration`            | QVQ, Qwen2-VL                                                            | T + I<sup>E+</sup> + V<sup>E+</sup>                                   | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc.                                                                 | ✅︎                     | ✅︎                          | ✅︎                    |
 | `Qwen2_5_VLForConditionalGeneration`         | Qwen2.5-VL                                                               | T + I<sup>E+</sup> + V<sup>E+</sup>                                   | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc.                                                                                     | ✅︎                     | ✅︎                          | ✅︎                    |
-| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni                                                             | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup>                   | `Qwen/Qwen2.5-Omni-7B`                                                                                                                                  | ✅︎                     | ✅︎\*                        |                       |
-| `SkyworkR1VChatModel`                        | Skywork-R1V-38B                                                          | T + I                                                                 | `Skywork/Skywork-R1V-38B`                                                                                                                               | ✅︎                     | ✅︎                          |                       |
-| `SmolVLMForConditionalGeneration`            | SmolVLM2                                                                 | T + I                                                                 | `SmolVLM2-2.2B-Instruct`                                                                                                                                | ✅︎                     | ✅︎                          |                       |
+| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni                                                             | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup>                   | `Qwen/Qwen2.5-Omni-7B`                                                                                                                                  |                      | ✅︎                         | ✅︎\*                     |
+| `SkyworkR1VChatModel`                        | Skywork-R1V-38B                                                          | T + I                                                                 | `Skywork/Skywork-R1V-38B`                                                                                                                               |                       | ✅︎                          | ✅︎                      |
+| `SmolVLMForConditionalGeneration`            | SmolVLM2                                                                 | T + I                                                                 | `SmolVLM2-2.2B-Instruct`                                                                                                                                | ✅︎                     |                           | ✅︎                      |
+| `TarsierForConditionalGeneration`                | Tarsier                                                                  | T + I<sup>E+</sup>                                                                                                     | `omni-search/Tarsier-7b`,`omni-search/Tarsier-34b`                                                                                                           |                      | ✅︎                      | ✅︎                   |
 
 <sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.  
 &nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:  
@@ -647,7 +649,7 @@ The following table lists those that are tested in vLLM.
 
 | Architecture                        | Models             | Inputs   | Example HF Models        | [LoRA][lora-adapter]   | [PP][distributed-serving]   |
 |-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|
-| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based   | T / I    | `royokong/e5-v`          | ✅︎                     |                             |
+| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based   | T / I    | `royokong/e5-v`          |                       |                             |
 | `Phi3VForCausalLM`                  | Phi-3-Vision-based | T + I    | `TIGER-Lab/VLM2Vec-Full` | 🚧                      | ✅︎                          |
 
 #### Transcription
diff --git a/docs/usage/metrics.md b/docs/usage/metrics.md
index 9ad7253184d9d86fa655dae3e71c2705176ebbbd..6603aa83b4af71825362994143ce24a5a9fad6df 100644
--- a/docs/usage/metrics.md
+++ b/docs/usage/metrics.md
@@ -35,19 +35,6 @@ The following metrics are exposed:
 --8<-- "vllm/engine/metrics.py:metrics-definitions"
 ```
 
-The following metrics are deprecated and due to be removed in a future version:
-
-- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and
-  `vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not
-  used in V1.
-- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits
-  counters in V1.
-- `vllm:time_in_queue_requests` because it duplicates
-  `vllm:request_queue_time_seconds`.
-- `vllm:model_forward_time_milliseconds` and
-  `vllm:model_execute_time_milliseconds` because
-  prefill/decode/inference time metrics should be used instead.
-
 Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
 but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
 and are then removed in version `X.Y+2`.
diff --git a/docs/usage/security.md b/docs/usage/security.md
index f1661828d68a4f3e01e93e9ed47f5bb91768275e..76140434dcb361129a5075306d10caadda385781 100644
--- a/docs/usage/security.md
+++ b/docs/usage/security.md
@@ -12,14 +12,14 @@ All communications between nodes in a multi-node vLLM deployment are **insecure
 
 The following options control inter-node communications in vLLM:
 
-1. **Environment Variables:**
+#### 1. **Environment Variables:**
    - `VLLM_HOST_IP`: Sets the IP address for vLLM processes to communicate on
 
-2. **KV Cache Transfer Configuration:**
+#### 2. **KV Cache Transfer Configuration:**
    - `--kv-ip`: The IP address for KV cache transfer communications (default: 127.0.0.1)
    - `--kv-port`: The port for KV cache transfer communications (default: 14579)
 
-3. **Data Parallel Configuration:**
+#### 3. **Data Parallel Configuration:**
    - `data_parallel_master_ip`: IP of the data parallel master (default: 127.0.0.1)
    - `data_parallel_master_port`: Port of the data parallel master (default: 29500)
 
@@ -31,6 +31,7 @@ refer to the [PyTorch Security
 Guide](https://github.com/pytorch/pytorch/security/policy#using-distributed-features).
 
 Key points from the PyTorch security guide:
+
 - PyTorch Distributed features are intended for internal communication only
 - They are not built for use in untrusted environments or networks
 - No authorization protocol is included for performance reasons
@@ -39,16 +40,16 @@ Key points from the PyTorch security guide:
 
 ### Security Recommendations
 
-1. **Network Isolation:**
+#### 1. **Network Isolation:**
    - Deploy vLLM nodes on a dedicated, isolated network
    - Use network segmentation to prevent unauthorized access
    - Implement appropriate firewall rules
 
-2. **Configuration Best Practices:**
+#### 2. **Configuration Best Practices:**
    - Always set `VLLM_HOST_IP` to a specific IP address rather than using defaults
    - Configure firewalls to only allow necessary ports between nodes
 
-3. **Access Control:**
+#### 3. **Access Control:**
    - Restrict physical and network access to the deployment environment
    - Implement proper authentication and authorization for management interfaces
    - Follow the principle of least privilege for all system components
diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md
index 889cfccdacac60008732576c30d2b55fbbac4c95..e9ab425a1d063e396e22a64ab4c17ccc6f87b42b 100644
--- a/docs/usage/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@@ -40,7 +40,7 @@ If other strategies don't solve the problem, it's likely that the vLLM instance
 - `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging.
 - `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem.
 - `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL.
-- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs.
+- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. Do not use this flag unless absolutely needed for debugging, it will cause significant delays in startup time.
 
 ## Incorrect network setup
 
diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md
index 3d5d7ce45cce4b4ffc0dbdad700a582d6c3d2d5c..baeb5411bcfdf885ee23d2c8b257c277e02fbb54 100644
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@@ -1,5 +1,7 @@
 # vLLM V1
 
+**We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.**
+
 V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
 
 To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
@@ -38,6 +40,8 @@ This living user guide outlines a few known **important changes and limitations*
 | **NVIDIA** | <nobr>🚀 Natively Supported</nobr>         |
 | **AMD**    | <nobr>🚧 WIP</nobr>           |
 | **TPU**    | <nobr>🚧 WIP</nobr>           |
+| **CPU**    | <nobr>🚧 WIP</nobr>           |
+
 #### Feature / Model
 
 | Feature / Model | Status |
@@ -51,9 +55,9 @@ This living user guide outlines a few known **important changes and limitations*
 | **Spec Decode**                             | <nobr>🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))</nobr>|
 | **Prompt Logprobs with Prefix Caching**     | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>|
 | **Structured Output Alternative Backends**  | <nobr>🟡 Planned</nobr>                                                           |
-| **Embedding Models**                        | <nobr>🟡 Planned ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249))</nobr> |
+| **Embedding Models**                        | <nobr>🚧 WIP ([PR #16188](https://github.com/vllm-project/vllm/pull/16188))</nobr> |
 | **Mamba Models**                            | <nobr>🟡 Planned</nobr>                                                           |
-| **Encoder-Decoder Models**                  | <nobr>🟡 Planned</nobr>                                                           |
+| **Encoder-Decoder Models**                  | <nobr>🟠 Delayed</nobr>                                                           |
 | **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr>                                                        |
 | **best_of**                                 | <nobr>🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))</nobr>|
 | **Per-Request Logits Processors**           | <nobr>🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360))</nobr> |
@@ -63,10 +67,11 @@ This living user guide outlines a few known **important changes and limitations*
 - **🟢 Functional**: Fully operational, with ongoing optimizations.  
 - **🚧 WIP**: Under active development.  
 - **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs).  
-- **🔴 Deprecated**: Not planned for v1 unless there is strong demand.
+- **🟠 Delayed**: Temporarily dropped in V1 but planned to be re-introduced later.
+- **🔴 Deprecated**: Not planned for V1 unless there is strong demand.
 
 **Note**: vLLM V1’s unified scheduler treats both prompt and output tokens the same
-way by using a simple dictionary (e.g., {request_id: num_tokens}) to dynamically
+way by using a simple dictionary (e.g., `{request_id: num_tokens}`) to dynamically
 allocate a fixed token budget per request, enabling features like chunked prefills,
 prefix caching, and speculative decoding without a strict separation between prefill
 and decode phases.
@@ -140,7 +145,9 @@ vLLM V1 currently excludes model architectures with the `SupportsV0Only` protoco
 and the majority fall into the following categories. V1 support for these models will be added eventually.
 
 **Embedding Models**  
-Instead of having a separate model runner, hidden states processor [RFC #12249](https://github.com/vllm-project/vllm/issues/12249), which is based on global logits processor [RFC #13360](https://github.com/vllm-project/vllm/pull/13360), has been proposed to enable simultaneous generation and embedding using the same engine instance in V1. It is still in the planning stage.
+The initial support will be provided by [PR #16188](https://github.com/vllm-project/vllm/pull/16188).
+
+Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) to enable simultaneous generation and embedding using the same engine instance in V1.
 
 **Mamba Models**  
 Models using selective state-space mechanisms (instead of standard transformer attention)
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 56cdd6861baa49ce1eeafaebd5f5df4a39ad7509..8e5cac78a4b200771ba536ce45cbaef70919d5c0 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference
 with the correct prompt format on audio language models.
diff --git a/examples/offline_inference/automatic_prefix_caching.py b/examples/offline_inference/automatic_prefix_caching.py
index 0d8c733042376861d8a09c8a92b1cd8a6f67f063..a01a9565a5fdec37d1466ea296c0125d5d2e9dfe 100644
--- a/examples/offline_inference/automatic_prefix_caching.py
+++ b/examples/offline_inference/automatic_prefix_caching.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Demonstration script for Automatic Prefix Caching (APC) in vLLM.
 
diff --git a/examples/offline_inference/basic/basic.py b/examples/offline_inference/basic/basic.py
index ae5ae7cb483461953959668a7da1d465e130cda6..78bfda9bcf4e3c5b8fbe7843e4be8b5b10dcec24 100644
--- a/examples/offline_inference/basic/basic.py
+++ b/examples/offline_inference/basic/basic.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 
diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py
index b0bb5aa71b8a7113c846571fdbab2f8f6f106900..d078c517d00e79f736887c83a4cc3e579145b325 100644
--- a/examples/offline_inference/basic/chat.py
+++ b/examples/offline_inference/basic/chat.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, EngineArgs
 from vllm.utils import FlexibleArgumentParser
diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py
index 40ccb1294e42486eacd2026f7e4942f6f8e59e6f..219064e97429b9abbd43f70b39d89e6e55a2cbfd 100644
--- a/examples/offline_inference/basic/classify.py
+++ b/examples/offline_inference/basic/classify.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from argparse import Namespace
 
diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py
index 38a73ccca251ed1ac00f6a0be1280dd0cb16fcf4..fc5ca23787be17f57825616d9b4aacd9937dd887 100644
--- a/examples/offline_inference/basic/embed.py
+++ b/examples/offline_inference/basic/embed.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from argparse import Namespace
 
diff --git a/examples/offline_inference/basic/generate.py b/examples/offline_inference/basic/generate.py
index 72f4a8208386d5a9d842fd993adeabb9236a66c5..6a41ef4d84bb6702a7351704f9e729b53272f478 100644
--- a/examples/offline_inference/basic/generate.py
+++ b/examples/offline_inference/basic/generate.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, EngineArgs
 from vllm.utils import FlexibleArgumentParser
diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py
index 3da73c6c407d4bb6be9c822164b6e2a15dd1af01..6a08de2d2c38c253e57d2e062d20a1de35eb119f 100644
--- a/examples/offline_inference/basic/score.py
+++ b/examples/offline_inference/basic/score.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from argparse import Namespace
 
diff --git a/examples/offline_inference/batch_llm_inference.py b/examples/offline_inference/batch_llm_inference.py
index c1edfb52ff70cdf93c5dcc47d324df17f254b2e6..b1c1ef620da8ddef49910f3f2fd6ef168a22757d 100644
--- a/examples/offline_inference/batch_llm_inference.py
+++ b/examples/offline_inference/batch_llm_inference.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use Ray Data for data parallel batch inference.
 
diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py
index 61230d89558428232cde6fa6f692750046b84435..6e56e24f2092c984611d2e0129bb9cbfe41b78b5 100644
--- a/examples/offline_inference/chat_with_tools.py
+++ b/examples/offline_inference/chat_with_tools.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # ruff: noqa
 import json
diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py
new file mode 100644
index 0000000000000000000000000000000000000000..df39e4c25d5c8b796d0fe47bf05db8c8f9303f38
--- /dev/null
+++ b/examples/offline_inference/context_extension.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This script demonstrates how to extend the context length
+of a Qwen model using the YARN method (rope_scaling)
+and run a simple chat example.
+
+Usage:
+    python examples/offline_inference/context_extension.py
+"""
+
+from vllm import LLM, SamplingParams
+
+
+def create_llm():
+    rope_theta = 1000000
+    original_max_position_embeddings = 32768
+    factor = 4.0
+
+    # Use yarn to extend context
+    hf_overrides = {
+        "rope_theta": rope_theta,
+        "rope_scaling": {
+            "rope_type": "yarn",
+            "factor": factor,
+            "original_max_position_embeddings": original_max_position_embeddings,
+        },
+        "max_model_len": int(original_max_position_embeddings * factor),
+    }
+
+    llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
+    return llm
+
+
+def run_llm_chat(llm):
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        max_tokens=128,
+    )
+
+    conversation = [
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "Hello"},
+        {"role": "assistant", "content": "Hello! How can I assist you today?"},
+    ]
+    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
+    return outputs
+
+
+def print_outputs(outputs):
+    print("\nGenerated Outputs:\n" + "-" * 80)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\n")
+        print(f"Generated text: {generated_text!r}")
+        print("-" * 80)
+
+
+def main():
+    llm = create_llm()
+    outputs = run_llm_chat(llm)
+    print_outputs(outputs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index bf60d883c410e369fede8e56d9bb3a2ca86261b1..3eccb4e11ab6f6de920948e03eb40dd5f2b6ac38 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Usage:
 Single node:
@@ -97,10 +98,14 @@ def main(
     # with DP, each rank should process different prompts.
     # usually all the DP ranks process a full dataset,
     # and each rank processes a different part of the dataset.
-    promts_per_rank = len(prompts) // dp_size
-    start = global_dp_rank * promts_per_rank
-    end = start + promts_per_rank
-    prompts = prompts[start:end]
+    floor = len(prompts) // dp_size
+    remainder = len(prompts) % dp_size
+
+    # Distribute prompts into even groups.
+    def start(rank):
+        return rank * floor + min(rank, remainder)
+
+    prompts = prompts[start(global_dp_rank) : start(global_dp_rank + 1)]
     if len(prompts) == 0:
         # if any rank has no prompts to process,
         # we need to set a placeholder prompt
diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
index 4ae5d3310e0bf8c8625fd8b6c365177926f0c6b1..8f3d1a5c003691a1cf8110c19fb9f74ed8a5ce33 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
index 5757a8a84b86a3d9a8b3ff893cecd5bdf72d41f7..0bfe7ec0e6cf6e79354543bb676946922df4d9a3 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
diff --git a/examples/offline_inference/disaggregated-prefill-v1/run.sh b/examples/offline_inference/disaggregated-prefill-v1/run.sh
index 0ebf45a1586a012d9075219cb37318d68fce00bc..c1dcc95a2bd0b594cdcc967f179929910136b89b 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/run.sh
+++ b/examples/offline_inference/disaggregated-prefill-v1/run.sh
@@ -1,5 +1,11 @@
 rm -rf local_storage/
-rm output.txt
 
-VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py
-VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py
+if [ -f "output.txt" ]; then
+    rm output.txt
+fi
+
+# The directory of current script
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py"
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py"
diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/offline_inference/disaggregated_prefill.py
index 3ccab0dcd6d32e448a59c3bff768916e98977a3d..05a361fee071711912f8625b1365eb23c938b130 100644
--- a/examples/offline_inference/disaggregated_prefill.py
+++ b/examples/offline_inference/disaggregated_prefill.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates the example usage of disaggregated prefilling
 We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py
index 606ce7799a88fe1aefac67154c6f0022e52a89af..ce977ee99bb8fb5dc456bd559a860ca12f6f1f9d 100644
--- a/examples/offline_inference/eagle.py
+++ b/examples/offline_inference/eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import json
 import os
diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/embed_jina_embeddings_v3.py
index 23f60c431fc247155dcef27a78db7481d412b879..e68128399ba2122d7c40ce970a9be3f3f766f08f 100644
--- a/examples/offline_inference/embed_jina_embeddings_v3.py
+++ b/examples/offline_inference/embed_jina_embeddings_v3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from argparse import Namespace
 
diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/embed_matryoshka_fy.py
index 59c0592ae9e237f243350152eca02f61bc653cb7..7f5d74d9a3ae07768c0ee40d4f0e51c1901d8643 100644
--- a/examples/offline_inference/embed_matryoshka_fy.py
+++ b/examples/offline_inference/embed_matryoshka_fy.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from argparse import Namespace
 
diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py
index 83dd1f667eb5f4ce0ffa9260274742f559dcf3c6..0da6fa5c4af5fb0d7739c2caf667846617589c1d 100644
--- a/examples/offline_inference/encoder_decoder.py
+++ b/examples/offline_inference/encoder_decoder.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Demonstrate prompting of text-to-text
 encoder/decoder models, specifically BART
diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py
index ae3737e37594182c12c187fefbe894552e5d5008..d27a902edb7e7622e637fc873618c093a5a619be 100644
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference with
 the explicit/implicit prompt format on enc-dec LMMs for text generation.
diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py
index 5d5e55a83d2215def2de47dab1c688821136ad6d..d7f2a1633113d0f2b92b232affd20f50ccace3f9 100644
--- a/examples/offline_inference/llm_engine_example.py
+++ b/examples/offline_inference/llm_engine_example.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates using the `LLMEngine`
 for processing prompts with various sampling parameters.
diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py
index 5bb2327a3f83ee93816b3d08a161e16e2d10c3b5..cc78c0cbbf7c07ad5dadee1eba89776d5376ce3d 100644
--- a/examples/offline_inference/load_sharded_state.py
+++ b/examples/offline_inference/load_sharded_state.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Validates the loading of a model saved with the sharded_state format.
 This script demonstrates how to load a model that was previously saved
diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
index 33c660015ba763576af85e835f6906a5312d30be..00d4cb9eb4c41948780e6a2beb0d71fb8c94b06b 100644
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use LoRA with different quantization techniques
 for offline inference.
diff --git a/examples/offline_inference/metrics.py b/examples/offline_inference/metrics.py
index 7927f758cb5754ff29cb25ae8eac080ef4ae90bd..00fb3f5bc89171c6cd2807c019d00b7a1f6b64fa 100644
--- a/examples/offline_inference/metrics.py
+++ b/examples/offline_inference/metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Vector
diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py
index 98fef2648f6bbd0e7b1009864f5d50062c035801..330103d5818a3ebd580d60b1aa6b13b0c00c5136 100644
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # ruff: noqa
 import argparse
diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py
index b750397f45b8d8d8918b3f963682b4053c88013f..d5b1b4ad29a92dfd41d6ab9ef07f4221c54a57c2 100644
--- a/examples/offline_inference/mlpspeculator.py
+++ b/examples/offline_inference/mlpspeculator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates the usage of text generation with an LLM model,
 comparing the performance with and without speculative decoding.
diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py
index 1fa2f16f82a8aa501848d1bd0075383df6266141..f0c00bcaaeb110c4a127f14ccc0b7f69eafeec8a 100644
--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use the multi-LoRA functionality
 for offline inference.
diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py
index f2d7698f22d7c6aee19b7b8005aee1214808b461..7826629a36d01dc7990e99c4efa3b81a6b3d3ddf 100644
--- a/examples/offline_inference/neuron.py
+++ b/examples/offline_inference/neuron.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 
diff --git a/examples/offline_inference/neuron_eagle.py b/examples/offline_inference/neuron_eagle.py
index 5d7fb819d347702110ffbf93d8f632ff4dd49476..0b2070c8e253105f6b7cb5f591ef2658e056f032 100644
--- a/examples/offline_inference/neuron_eagle.py
+++ b/examples/offline_inference/neuron_eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to run offline inference with an EAGLE speculative
 decoding model on neuron. To use EAGLE speculative decoding, you must use
diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py
index ec38525b9daf289e84f1720fdd59b5ba71bf0619..c0ecfac508996974d2dd7c2344a044e17cda8be2 100644
--- a/examples/offline_inference/neuron_int8_quantization.py
+++ b/examples/offline_inference/neuron_int8_quantization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
diff --git a/examples/offline_inference/neuron_multimodal.py b/examples/offline_inference/neuron_multimodal.py
new file mode 100644
index 0000000000000000000000000000000000000000..26f7505f2fa53b4b9d0a677447972a4ab5f4d554
--- /dev/null
+++ b/examples/offline_inference/neuron_multimodal.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import requests
+import torch
+from neuronx_distributed_inference.models.mllama.utils import add_instruct
+from PIL import Image
+
+from vllm import LLM, SamplingParams, TextPrompt
+
+
+def get_image(image_url):
+    image = Image.open(requests.get(image_url, stream=True).raw)
+    return image
+
+
+# Model Inputs
+PROMPTS = [
+    "What is in this image? Tell me a story",
+    "What is the recipe of mayonnaise in two sentences?",
+    "Describe this image",
+    "What is the capital of Italy famous for?",
+]
+IMAGES = [
+    get_image(
+        "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
+    ),
+    None,
+    get_image(
+        "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
+    ),
+    None,
+]
+SAMPLING_PARAMS = [
+    dict(top_k=1, temperature=1.0, top_p=1.0, max_tokens=16)
+    for _ in range(len(PROMPTS))
+]
+
+
+def get_VLLM_mllama_model_inputs(prompt, single_image, sampling_params):
+    # Prepare all inputs for mllama generation, including:
+    # 1. put text prompt into instruct chat template
+    # 2. compose single text and single image prompt into Vllm's prompt class
+    # 3. prepare sampling parameters
+    input_image = single_image
+    has_image = torch.tensor([1])
+    if isinstance(single_image, torch.Tensor) and single_image.numel() == 0:
+        has_image = torch.tensor([0])
+
+    instruct_prompt = add_instruct(prompt, has_image)
+    inputs = TextPrompt(prompt=instruct_prompt)
+
+    if input_image is not None:
+        inputs["multi_modal_data"] = {"image": input_image}
+
+    sampling_params = SamplingParams(**sampling_params)
+    return inputs, sampling_params
+
+
+def print_outputs(outputs):
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+def main():
+    assert (
+        len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS)
+    ), f"""Text, image prompts and sampling parameters should have the 
+            same batch size; but got {len(PROMPTS)}, {len(IMAGES)}, 
+            and {len(SAMPLING_PARAMS)}"""
+
+    # Create an LLM.
+    llm = LLM(
+        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+        max_num_seqs=1,
+        max_model_len=4096,
+        block_size=4096,
+        device="neuron",
+        tensor_parallel_size=32,
+        override_neuron_config={
+            "sequence_parallel_enabled": False,
+            "skip_warmup": True,
+            "save_sharded_checkpoint": True,
+            "on_device_sampling_config": {
+                "global_topk": 1,
+                "dynamic": False,
+                "deterministic": False,
+            },
+        },
+    )
+
+    batched_inputs = []
+    batched_sample_params = []
+    for pmpt, img, params in zip(PROMPTS, IMAGES, SAMPLING_PARAMS):
+        inputs, sampling_params = get_VLLM_mllama_model_inputs(pmpt, img, params)
+        # test batch-size = 1
+        outputs = llm.generate(inputs, sampling_params)
+        print_outputs(outputs)
+        batched_inputs.append(inputs)
+        batched_sample_params.append(sampling_params)
+
+    # test batch-size = 4
+    outputs = llm.generate(batched_inputs, batched_sample_params)
+    print_outputs(outputs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py
index ecacbab771c2a1776305cd24ddf6eddc455c25c4..2ef69f29863d751d30f1911f715055a1d2fa0bc6 100644
--- a/examples/offline_inference/neuron_speculation.py
+++ b/examples/offline_inference/neuron_speculation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to run offline inference with a speculative
 decoding model on neuron.
diff --git a/examples/offline_inference/openai_batch/README.md b/examples/offline_inference/openai_batch/README.md
index 42a19f71e9de313c0f17679b554272b3e10b6926..ce75297821221ab6d962a9b440ed90db014cbf38 100644
--- a/examples/offline_inference/openai_batch/README.md
+++ b/examples/offline_inference/openai_batch/README.md
@@ -48,7 +48,19 @@ The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`
 
 ```console
-python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch \
+    -i offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
+or use command-line:
+
+```console
+vllm run-batch \
+    -i offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ### Step 3: Check your results
@@ -68,7 +80,19 @@ The batch runner supports remote input and output urls that are accessible via h
 For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run
 
 ```console
-python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch \
+    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
+or use command-line:
+
+```console
+vllm run-batch \
+    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ## Example 3: Integrating with AWS S3
@@ -164,6 +188,15 @@ python -m vllm.entrypoints.openai.run_batch \
     --model --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
+or use command-line:
+
+```console
+vllm run-batch \
+    -i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
+    -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
+    --model --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
 ### Step 4: View your results
 
 Your results are now on S3. You can view them in your terminal by running
diff --git a/examples/offline_inference/prefix_caching.py b/examples/offline_inference/prefix_caching.py
index d3dad24956a69ff57cbd8a619a7468b32e802e69..69989138239479af0020860284d57bcba698dcbd 100644
--- a/examples/offline_inference/prefix_caching.py
+++ b/examples/offline_inference/prefix_caching.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 from vllm.distributed import cleanup_dist_env_and_memory
diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py
index 21f7668adc863463a2c6e06eb505b287b7cb41ca..567c448a8c97b611e4576cb85706b012b4bb98d1 100644
--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This is a demo script showing how to use the
 PrithviGeospatialMAE model with vLLM
diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py
index 244a64b891c96d24cbbf37c26e8b533f2a65613e..392fba8fc5ead28ad6f49ccdc04e0fe34962aaad 100644
--- a/examples/offline_inference/profiling.py
+++ b/examples/offline_inference/profiling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import inspect
 import json
diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py
index 82737d538df4f2f7ab2f3247a7cea8be4462ef96..dfcbd8c8d36059b5505996f37c6e7b2dd0f5a0b9 100644
--- a/examples/offline_inference/profiling_tpu/profiling.py
+++ b/examples/offline_inference/profiling_tpu/profiling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import dataclasses
@@ -69,7 +70,7 @@ def main(args: argparse.Namespace):
     return
 
 
-if __name__ == "__main__":
+def parse_args():
     parser = FlexibleArgumentParser(
         description="Benchmark the latency of processing a single batch of "
         "requests till completion."
@@ -101,5 +102,9 @@ if __name__ == "__main__":
     )
 
     parser = EngineArgs.add_cli_args(parser)
-    args = parser.parse_args()
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
     main(args)
diff --git a/examples/offline_inference/prompt_embed_inference.py b/examples/offline_inference/prompt_embed_inference.py
index 9f6a602233f8ae403064e08b06f49c100927973f..5d79222a1bb3a18374008b47aea08fe072337a30 100644
--- a/examples/offline_inference/prompt_embed_inference.py
+++ b/examples/offline_inference/prompt_embed_inference.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Demonstrates how to generate prompt embeddings using
 Hugging Face Transformers  and use them as input to vLLM
diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py
index 6482490d1a93a7245705d5253ed72b1b08484ca3..62effd5c8b62e3b40bb078b62628405159652e21 100644
--- a/examples/offline_inference/qwen2_5_omni/only_thinker.py
+++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference
 with the correct prompt format on Qwen2.5-Omni (thinker only).
diff --git a/examples/offline_inference/qwen_1m.py b/examples/offline_inference/qwen_1m.py
index 856a35b0e59be20a00537a502d313bfc9c0fd853..d8d61667f688b0a47cfb221b3f4780bd8f1032d3 100644
--- a/examples/offline_inference/qwen_1m.py
+++ b/examples/offline_inference/qwen_1m.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from urllib.request import urlopen
 
diff --git a/examples/offline_inference/reproducibility.py b/examples/offline_inference/reproducibility.py
index 6d048986e7109afc1dd0947acd2f556b4604f17e..d909438b41042cf252508750f7a88372e323ca8a 100644
--- a/examples/offline_inference/reproducibility.py
+++ b/examples/offline_inference/reproducibility.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Demonstrates how to achieve reproducibility in vLLM.
 
diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py
index a8f6977e29a4961f7bbb57bd35c7366d25318223..c6e63531a99d167d69aaef7376d602a4ebd546a7 100644
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 a simple demonstration of RLHF with vLLM, inspired by
 the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .
diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py
index 76eafdca1f6c7e0f381209b8ec58ca38ce60ffec..096363e6830175cdd135a36f27fe4b87c3879006 100644
--- a/examples/offline_inference/rlhf_colocate.py
+++ b/examples/offline_inference/rlhf_colocate.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 a simple demonstration to show how to co-locate
 vLLM worker with training actors on the same GPUs,
diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py
index 3461af707eba8f8ba0385b69cf1a79bfb35ed168..c445224d75686cbe258b3b1dd4fe758634feeb87 100644
--- a/examples/offline_inference/rlhf_utils.py
+++ b/examples/offline_inference/rlhf_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 
 
diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py
index 860fe2b5fe067d296fe9f079fcf3a6b9a957f17e..9b154e370642b76a4dd4b4aef74413a59e66c6af 100644
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Saves each worker's model state dict directly to a checkpoint, which enables a
 fast load path for large tensor-parallel models where each worker only needs to
diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py
index d583110c8e69bc5615be361c277aba30cd44de7f..46858fffadc529ebfd3cacdca6419c51107c3703 100644
--- a/examples/offline_inference/simple_profiling.py
+++ b/examples/offline_inference/simple_profiling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import time
diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py
index 9ed7299606b7e1d8f80fe33fb2e7d30fcf62761d..8ef121ebe848ececc835883986c23d6f838e739b 100644
--- a/examples/offline_inference/structured_outputs.py
+++ b/examples/offline_inference/structured_outputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates the example usage of guided decoding
 to generate structured outputs using vLLM. It shows how to apply
diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py
index 2fa49c0835e32414d7f04b19fb15a8dc8f0f8e8a..3d3d7946cdb412bb337c2d267990640b956127a4 100644
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/offline_inference/torchrun_example.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 experimental support for tensor-parallel inference with torchrun,
 see https://github.com/vllm-project/vllm/issues/11400 for
diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py
index e4a75b3f93803ded4b07732dda025e2505f36206..9776f4fe322b9213721c1a2fd369e92cd08980e4 100644
--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
@@ -1,4 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import argparse
+import os
 
 from vllm import LLM, SamplingParams
 
@@ -18,14 +22,28 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
 
 
 def main():
+    parser = argparse.ArgumentParser(description="TPU offline inference example")
+    parser.add_argument("--use-spmd", action="store_true", help="Enable SPMD mode")
+    args = parser.parse_args()
+
+    llm_args = {
+        "model": "Qwen/Qwen2-1.5B-Instruct",
+        "max_num_batched_tokens": 64,
+        "max_num_seqs": 4,
+        "max_model_len": 128,
+    }
+    if args.use_spmd:
+        os.environ["VLLM_XLA_USE_SPMD"] = "1"
+        # Can only hardcode the number of chips for now.
+        # calling xr.global_runtime_device_count() beforeing init SPMD env in
+        # torch_xla will mess up the distributed env.
+        llm_args["tensor_parallel_size"] = 8
+        # Use Llama, for num_kv_heads = 8.
+        llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct"
+
     # Set `enforce_eager=True` to avoid ahead-of-time compilation.
     # In real workloads, `enforace_eager` should be `False`.
-    llm = LLM(
-        model="Qwen/Qwen2-1.5B-Instruct",
-        max_num_batched_tokens=64,
-        max_num_seqs=4,
-        max_model_len=128,
-    )
+    llm = LLM(**llm_args)
     outputs = llm.generate(prompts, sampling_params)
     print("-" * 50)
     for output, answer in zip(outputs, answers):
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index f0504501639d2b6c8323b9d5727c0653a20a53d2..15dbd9f44128a92c4f33ff794e7a864ae44d2843 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference with
 the correct prompt format on vision language models for text generation.
@@ -333,6 +334,25 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# omni-research/Tarsier-7b
+def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "omni-research/Tarsier-7b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={modality: 1},
+    )
+    prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # InternVL
 def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "OpenGVLab/InternVL3-2B"
@@ -1091,6 +1111,7 @@ model_example_map = {
     "qwen2_5_omni": run_qwen2_5_omni,
     "skywork_chat": run_skyworkr1v,
     "smolvlm": run_smolvlm,
+    "tarsier": run_tarsier,
 }
 
 
diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py
index cee02d06c607ca500fe2fa739023e8ba7c2ade23..1f5bd4ad72b05c53130522f085bb747aa7aa106b 100644
--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference with
 the correct prompt format on vision language models for multimodal embedding.
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index e776ff7fe6aecac71a6b7c3419fad9f45c8ebdfa..ea7a793d026b4dc6879d3d7f96a8cd1e877930d1 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference with
 multi-image input on vision language models for text generation,
@@ -592,21 +593,21 @@ def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
 
 def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     try:
-        from qwen_vl_utils import process_vision_info
+        from qwen_vl_utils import smart_resize
     except ModuleNotFoundError:
         print(
             "WARNING: `qwen-vl-utils` not installed, input images will not "
             "be automatically resized. You can enable this functionality by "
             "`pip install qwen-vl-utils`."
         )
-        process_vision_info = None
+        smart_resize = None
 
     model_name = "Qwen/Qwen2-VL-7B-Instruct"
 
     # Tested on L40
     engine_args = EngineArgs(
         model=model_name,
-        max_model_len=32768 if process_vision_info is None else 4096,
+        max_model_len=32768 if smart_resize is None else 4096,
         max_num_seqs=5,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
@@ -629,10 +630,18 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
         messages, tokenize=False, add_generation_prompt=True
     )
 
-    if process_vision_info is None:
+    if smart_resize is None:
         image_data = [fetch_image(url) for url in image_urls]
     else:
-        image_data, _ = process_vision_info(messages)
+
+        def post_process_image(image: Image) -> Image:
+            width, height = image.size
+            resized_height, resized_width = smart_resize(
+                height, width, max_pixels=1024 * 28 * 28
+            )
+            return image.resize((resized_width, resized_height))
+
+        image_data = [post_process_image(fetch_image(url)) for url in image_urls]
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -643,20 +652,20 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
 
 def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     try:
-        from qwen_vl_utils import process_vision_info
+        from qwen_vl_utils import smart_resize
     except ModuleNotFoundError:
         print(
             "WARNING: `qwen-vl-utils` not installed, input images will not "
             "be automatically resized. You can enable this functionality by "
             "`pip install qwen-vl-utils`."
         )
-        process_vision_info = None
+        smart_resize = None
 
     model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
 
     engine_args = EngineArgs(
         model=model_name,
-        max_model_len=32768 if process_vision_info is None else 4096,
+        max_model_len=32768 if smart_resize is None else 4096,
         max_num_seqs=5,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
@@ -679,10 +688,38 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
         messages, tokenize=False, add_generation_prompt=True
     )
 
-    if process_vision_info is None:
+    if smart_resize is None:
         image_data = [fetch_image(url) for url in image_urls]
     else:
-        image_data, _ = process_vision_info(messages, return_video_kwargs=False)
+
+        def post_process_image(image: Image) -> Image:
+            width, height = image.size
+            resized_height, resized_width = smart_resize(
+                height, width, max_pixels=1024 * 28 * 28
+            )
+            return image.resize((resized_width, resized_height))
+
+        image_data = [post_process_image(fetch_image(url)) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
+def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "omni-research/Tarsier-7b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    prompt = f"USER: {'<image>' * len(image_urls)}\n{question}\n ASSISTANT:"
+    image_data = [fetch_image(url) for url in image_urls]
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -712,6 +749,7 @@ model_example_map = {
     "qwen2_vl": load_qwen2_vl,
     "qwen2_5_vl": load_qwen2_5_vl,
     "smolvlm": load_smolvlm,
+    "tarsier": load_tarsier,
 }
 
 
diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py
index cc190e91c141d98a9aa80b47897d03f613e37808..84854911bade1bffa3ebdfb9a2fba24070e3878f 100644
--- a/examples/online_serving/api_client.py
+++ b/examples/online_serving/api_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Example Python client for `vllm.entrypoints.api_server`
 Start the demo server:
     python -m vllm.entrypoints.api_server --model <model_name>
diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py
index e57b94e8805f95d06e05a87bb9f0ec3d94ecb587..63c9ff9e9398031ea0a710296628af34e845f490 100644
--- a/examples/online_serving/cohere_rerank_client.py
+++ b/examples/online_serving/cohere_rerank_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Example of using the OpenAI entrypoint's rerank API which is compatible with
 the Cohere SDK: https://github.com/cohere-ai/cohere-python
diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
index 2ffba4a7ed3f9adc8a4ed96d2921256a1403c232..16c32dcaa5d312ee8442be2c3aa8a242d140dc38 100644
--- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
+++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file provides a disaggregated prefilling proxy demo to demonstrate an
 example usage of XpYd disaggregated prefilling.
diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py
index 3f2a3d01b456391847f28cc9dbbc6f9428645fd8..d5d0a07a29183bad4f7a0dc28877c8ef86ea669d 100644
--- a/examples/online_serving/gradio_openai_chatbot_webserver.py
+++ b/examples/online_serving/gradio_openai_chatbot_webserver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Example for starting a Gradio OpenAI Chatbot Webserver
 Start vLLM API server:
     vllm serve meta-llama/Llama-2-7b-chat-hf
diff --git a/examples/online_serving/gradio_webserver.py b/examples/online_serving/gradio_webserver.py
index fd341ff493b56f07bd99f348d4df779cb194ddb5..86d9ceb48bb048fd2e0b3aa01226f052145dc260 100644
--- a/examples/online_serving/gradio_webserver.py
+++ b/examples/online_serving/gradio_webserver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Example for starting a Gradio Webserver
 Start vLLM API server:
     python -m vllm.entrypoints.api_server \
diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py
index 7eb3d2193f41bfdcdd6c6967587168603032c016..908d6a9240aa9303e3dd16f5bfba06422b621400 100644
--- a/examples/online_serving/jinaai_rerank_client.py
+++ b/examples/online_serving/jinaai_rerank_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Example of using the OpenAI entrypoint's rerank API which is compatible with
 Jina and Cohere https://jina.ai/reranker
diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py
index 65d74dccab807546b952bf70b907b1a21cf74444..584db53db4e400044263113f516909fd9464d651 100644
--- a/examples/online_serving/kv_events_subscriber.py
+++ b/examples/online_serving/kv_events_subscriber.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any, Optional, Union
 
 import msgspec
diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/online_serving/multi_instance_data_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..62b1ec71af14d42b58d315e30bdbd752f788c55c
--- /dev/null
+++ b/examples/online_serving/multi_instance_data_parallel.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+from typing import Optional
+
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+
+"""
+To run this example, run the following commands simultaneously with
+different CUDA_VISIBLE_DEVICES:
+    python examples/online_serving/multi_instance_data_parallel.py
+
+    vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \
+        --data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \
+        --data-parallel-size-local 1 --enforce-eager --headless
+
+Once both instances have completed the handshake, this example will
+send a request to the instance with DP rank 1.
+"""
+
+
+async def main():
+    engine_args = AsyncEngineArgs(
+        model="ibm-research/PowerMoE-3b",
+        data_parallel_size=2,
+        dtype="auto",
+        max_model_len=2048,
+        data_parallel_address="127.0.0.1",
+        data_parallel_rpc_port=62300,
+        data_parallel_size_local=1,
+        enforce_eager=True,
+    )
+
+    engine_client = AsyncLLMEngine.from_engine_args(engine_args)
+
+    sampling_params = SamplingParams(
+        temperature=0.7,
+        top_p=0.9,
+        max_tokens=100,
+    )
+
+    prompt = "Who won the 2004 World Series?"
+    final_output: Optional[RequestOutput] = None
+    async for output in engine_client.generate(
+        prompt=prompt,
+        sampling_params=sampling_params,
+        request_id="abcdef",
+        data_parallel_rank=1,
+    ):
+        final_output = output
+    if final_output:
+        print(final_output.outputs[0].text)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py
index 2856e3be3e2dd53cca2f3e928a7e8cd6f1fead96..def95deb0c95d8954fce642cbc9b2a5875236d75 100644
--- a/examples/online_serving/openai_chat_completion_client.py
+++ b/examples/online_serving/openai_chat_completion_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Example Python client for OpenAI Chat Completion using vLLM API server
 NOTE: start a supported chat completion model server with `vllm serve`, e.g.
     vllm serve meta-llama/Llama-2-7b-chat-hf
diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index 8c3c6ecdd4b01556285b7c9286576cd184c5c027..c99b5148de875c0e07462873e21f986626bb5324 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """An example showing how to use vLLM to serve multimodal models
 and run online serving with OpenAI client.
 
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py
index a0d7841f644fcb94233b6922683cbd75062fec4a..41dbb3236297c37b5f9c51883551418575f702f3 100644
--- a/examples/online_serving/openai_chat_completion_client_with_tools.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Set up this example by starting a vLLM OpenAI-compatible server with tool call
 options enabled. For example:
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
index 45c4232fe1dea8de151a4f821e9caa629834a46e..7eb8668213eef932055d03f936bac723bcfe0329 100644
--- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 To run this example, you can start the vLLM server
 without any specific flags:
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py
index a4134ea43c4b3a14c8917a1d090c96141b38e27c..5c55d53138a8f89048d3f678c40355497932b0d5 100644
--- a/examples/online_serving/openai_chat_completion_structured_outputs.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 To run this example, you need to start the vLLM server:
 
@@ -138,7 +139,6 @@ def extra_backend_options_completion(client: OpenAI, model: str):
             extra_body={
                 "guided_regex": r"\w+@\w+\.com\n",
                 "stop": ["\n"],
-                "guided_decoding_backend": "xgrammar",
                 "guided_decoding_disable_fallback": True,
             },
         )
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
index c73208abe6005273f351d78a22afcc852a5b3a19..ec7d8b95472e6bf3cf15aa751a902e49901b9cda 100644
--- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from openai import OpenAI
 
 # This example demonstrates the `structural_tag` response format.
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
index 1ca61a8d5895f0edfd6fd55fc7c2cdd34724de56..bfbee7513874a5bee7aabca1ab7d4b5af23bdab8 100644
--- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 An example shows how to generate structured outputs from reasoning models
 like DeepSeekR1. The thinking process will not be guided by the JSON
diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
index a5febad45863b72237389d43c3e5fae53f3b9715..4006d07f73b0011452b4076a86a415f303499106 100644
--- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 An example demonstrates how to use tool calling with reasoning models 
 like QwQ-32B. The reasoning_content will not be parsed by the tool 
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py
index f6b8082115f125c48c03e317ff1ddaa77a082909..932dbeb2e7a2425c65af70eb8201f51a9029874f 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
index f984fbabf24fd29ca60060a09be59484e52ba6a3..5a91929770945811991c808c07124e8dfcd99c48 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
index ee519e555ff7f73808c351448e34d0ced718c49f..70f3c2f19cf14cf52640fff0e368f786945c6da3 100644
--- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import base64
diff --git a/examples/online_serving/openai_classification_client.py b/examples/online_serving/openai_classification_client.py
index 649cfa5d6686be367cc1ed6fcee529eeb4c5b28c..b10e7acbd26c1f014eea607b634811ee5a8775ca 100644
--- a/examples/online_serving/openai_classification_client.py
+++ b/examples/online_serving/openai_classification_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import pprint
diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py
index b1d21b5e4b9f7078dfdd79c60a97eedfff4e90db..df6e4e9429650b7db914f585f4ccadb643d9c887 100644
--- a/examples/online_serving/openai_completion_client.py
+++ b/examples/online_serving/openai_completion_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 
diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py
index 7891e14cb71e285ce3bd0c8df2275b2915f8c364..2e0d168d615c67bd6312a17b41d103020a2bc117 100644
--- a/examples/online_serving/openai_cross_encoder_score.py
+++ b/examples/online_serving/openai_cross_encoder_score.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Example online usage of Score API.
 
diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py
index a055654e91332d998e4d0cb128e41f6d8ecfe742..6bc390861e2eeef859a38925e7fcb46271c091cf 100644
--- a/examples/online_serving/openai_embedding_client.py
+++ b/examples/online_serving/openai_embedding_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from openai import OpenAI
 
diff --git a/examples/online_serving/openai_embedding_matryoshka_fy.py b/examples/online_serving/openai_embedding_matryoshka_fy.py
index 4544dcfb5ab09edbcf0a6ba416572f4c9c2142f3..653da8d18b705aa9290541245cc5aaebac017ccc 100644
--- a/examples/online_serving/openai_embedding_matryoshka_fy.py
+++ b/examples/online_serving/openai_embedding_matryoshka_fy.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Example Python client for embedding API dimensions using vLLM API server
 NOTE:
     start a supported Matryoshka Embeddings model server with `vllm serve`, e.g.
diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py
index 2620a12320241752fdc93f1a4cc1bf9a13408404..8252b36705cc6fff104e055c14f7276a9a5131a9 100644
--- a/examples/online_serving/openai_pooling_client.py
+++ b/examples/online_serving/openai_pooling_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Example online usage of Pooling API.
 
diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py
index eb501ae72aa9f6899ce2cbc399f925291ccc7d78..12d45de3c81b0a99ba0b2d36474b4c823fad5606 100644
--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import json
 
diff --git a/examples/online_serving/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py
index 33d365f0caa568f6955b324458295c6265b80a91..018d986ad8732ad591739e83558e37ae0a2e9b89 100644
--- a/examples/online_serving/opentelemetry/dummy_client.py
+++ b/examples/online_serving/opentelemetry/dummy_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import requests
 from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
diff --git a/examples/online_serving/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json
index fbe96b48e799556b7a168c95dafeb724941c2dfc..3488956a5b24cb8e1f8fd44875a04ecd8977cb15 100644
--- a/examples/online_serving/prometheus_grafana/grafana.json
+++ b/examples/online_serving/prometheus_grafana/grafana.json
@@ -577,23 +577,6 @@
           "refId": "A",
           "useBackend": false
         },
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
-          },
-          "disableTextWrap": false,
-          "editorMode": "builder",
-          "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}",
-          "fullMetaSearch": false,
-          "hide": false,
-          "includeNullMetadata": true,
-          "instant": false,
-          "legendFormat": "Num Swapped",
-          "range": true,
-          "refId": "B",
-          "useBackend": false
-        },
         {
           "datasource": {
             "type": "prometheus",
@@ -874,19 +857,6 @@
           "legendFormat": "GPU Cache Usage",
           "range": true,
           "refId": "A"
-        },
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
-          },
-          "editorMode": "code",
-          "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}",
-          "hide": false,
-          "instant": false,
-          "legendFormat": "CPU Cache Usage",
-          "range": true,
-          "refId": "B"
         }
       ],
       "title": "Cache Utilization",
diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/online_serving/prompt_embed_inference_with_openai_client.py
index 85ea2340736e82c5713e19ff4eb20855a0d8812c..3a90421383775d95f0f7bcdd88aa694d8fe60ba0 100644
--- a/examples/online_serving/prompt_embed_inference_with_openai_client.py
+++ b/examples/online_serving/prompt_embed_inference_with_openai_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 vLLM OpenAI-Compatible Client with Prompt Embeddings
 
diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/online_serving/ray_serve_deepseek.py
index a76020130c3ac395393cc91bfac14ed2d637692a..9471563ddb76bab91b1bf4d5de81bde0612178ae 100644
--- a/examples/online_serving/ray_serve_deepseek.py
+++ b/examples/online_serving/ray_serve_deepseek.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
 See more details at:
diff --git a/examples/online_serving/retrieval_augmented_generation_with_langchain.py b/examples/online_serving/retrieval_augmented_generation_with_langchain.py
index 37af3b3887f57cd915673847095a939c02b25d24..d9a4cadb036e200e1414433ffacc7075ae6aa00c 100644
--- a/examples/online_serving/retrieval_augmented_generation_with_langchain.py
+++ b/examples/online_serving/retrieval_augmented_generation_with_langchain.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Retrieval Augmented Generation (RAG) Implementation with Langchain
 ==================================================================
diff --git a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py
index 08796b1b3a546f25c8bec3721bf4ffd39fde5775..be4796acd1b67c716d4ac9a3e0e755129ec3e7b0 100644
--- a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py
+++ b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 RAG (Retrieval Augmented Generation) Implementation with LlamaIndex
 ================================================================
diff --git a/examples/online_serving/streamlit_openai_chatbot_webserver.py b/examples/online_serving/streamlit_openai_chatbot_webserver.py
index 0722aa671f66beaf009069a10bfa53b3d278f31e..dab56172ee3a32dbd6c5d0371931a61407720a6e 100644
--- a/examples/online_serving/streamlit_openai_chatbot_webserver.py
+++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 vLLM Chat Assistant - A Streamlit Web Interface
 
diff --git a/examples/online_serving/utils.py b/examples/online_serving/utils.py
index 0781a27f19c51599e2eb916a84c41f3631e99745..a512d8a31b53ee2225ec321cd78e3b003d3a0641 100644
--- a/examples/online_serving/utils.py
+++ b/examples/online_serving/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from openai import APIConnectionError, OpenAI
 from openai.pagination import SyncPage
 from openai.types.model import Model
diff --git a/examples/others/lmcache/cpu_offload_lmcache.py b/examples/others/lmcache/cpu_offload_lmcache.py
index 98eafb31ed4f14d0ef6605385aaeb99f149e3fb1..354e4cc8c5723a7e746fc64e6622d9886134aa7c 100644
--- a/examples/others/lmcache/cpu_offload_lmcache.py
+++ b/examples/others/lmcache/cpu_offload_lmcache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates the example usage of cpu offloading
 with LMCache in vLLM v1 or v0.
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v0.py b/examples/others/lmcache/disagg_prefill_lmcache_v0.py
index b2b7b3b2c1f97ce365ea36b44f2800ce7be11a85..6669eb3fb3d3890ef78029cda7936c36faeb0c9f 100644
--- a/examples/others/lmcache/disagg_prefill_lmcache_v0.py
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v0.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates the example usage of disaggregated prefilling
 with LMCache.
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
index df8a41293504908016d3710ff5fa713805c55fee..0b6c9213ebfff6eeb7a8416644a006c913ac73e0 100644
--- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
@@ -33,7 +33,7 @@ check_num_gpus() {
 
 ensure_python_library_installed() {
     echo "Checking if $1 is installed..."
-    python -c "import $1" > /dev/null 2>&1
+    python3 -c "import $1" > /dev/null 2>&1
     if [ $? -ne 0 ]; then
         if [ "$1" == "nixl" ]; then
             echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation."
@@ -121,8 +121,8 @@ main() {
     echo "All servers are up. Starting benchmark..."
 
     # begin benchmark
-    cd ../../../benchmarks/
-    python benchmark_serving.py --port 9000 --seed $(date +%s) \
+    cd ../../../../benchmarks/
+    python3 benchmark_serving.py --port 9000 --seed $(date +%s) \
         --model meta-llama/Llama-3.1-8B-Instruct \
         --dataset-name random --random-input-len 7500 --random-output-len 200 \
         --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
index 20155c2036580c9824ec5c170fc1aca2df12bed1..5d8e38c73b89af5e0033d38664e95877ee266b2b 100644
--- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import os
diff --git a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
index 89945d67a6f388ba429a27cd065a0113ede106ac..508cf4a5a498751c106ec31c1ea0f581273ab575 100644
--- a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
+++ b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates the example usage of remote KV cache sharing
 with LMCache.
diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py
index 17577763083347b9f43a6945bdcdaf9339539df6..9e1003a5c39d0722c5c92f1d85f2025acd35a8d7 100644
--- a/examples/others/tensorize_vllm_model.py
+++ b/examples/others/tensorize_vllm_model.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import dataclasses
diff --git a/examples/tool_chat_template_deepseekr1.jinja b/examples/tool_chat_template_deepseekr1.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..9ae19341fc48ac5b6e3b78fe4cd034d6917b3a66
--- /dev/null
+++ b/examples/tool_chat_template_deepseekr1.jinja
@@ -0,0 +1,92 @@
+{% if not add_generation_prompt is defined %}
+    {% set add_generation_prompt = false %}
+{% endif %}
+{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true, is_last_user=false) %}
+{%- for message in messages %}
+    {%- if message['role'] == 'system' %}
+        {%- if ns.is_first_sp %}
+            {% set ns.system_prompt = ns.system_prompt + message['content'] %}
+            {% set ns.is_first_sp = false %}
+        {%- else %}
+            {% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+
+{#- Adapted from https://github.com/sgl-project/sglang/blob/main/examples/chat_template/tool_chat_template_deepseekr1.jinja #}
+{% if tools is defined and tools is not none %}
+    {% set tool_ns = namespace(text='You are a helpful assistant with tool calling capabilities. '
+        'When a tool call is needed, you MUST use the following format to issue the call:\n'
+        '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>FUNCTION_NAME\n'
+        '```json\n{"param1": "value1", "param2": "value2"}\n```<｜tool▁call▁end｜><｜tool▁calls▁end｜>\n\n'
+        'Make sure the JSON is valid.'
+        '## Tools\n\n### Function\n\nYou have the following functions available:\n\n') %}
+    {% for tool in tools %}
+        {% set tool_ns.text = tool_ns.text + '\n```json\n' + (tool | tojson) + '\n```\n' %}
+    {% endfor %}
+    {% set ns.system_prompt = ns.system_prompt + '\n\n' + tool_ns.text %}
+{% endif %}
+
+{{ bos_token }}
+{{ ns.system_prompt }}
+{%- for message in messages %}
+    {% set content = message['content'] %}
+    {%- if message['role'] == 'user' %}
+        {%- set ns.is_tool = false -%}
+        {%- set ns.is_first = false -%}
+        {%- set ns.is_last_user = true -%}
+        {{'<｜User｜>' + content + '<｜Assistant｜>'}}
+    {%- endif %}
+    {%- if message['role'] == 'assistant' %}
+        {% if '</think>' in content %}
+            {% set content = content.split('</think>')[-1] %}
+        {% endif %}
+    {% endif %}
+    {%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}
+        {%- set ns.is_last_user = false -%}
+        {%- if ns.is_tool %}
+            {{'<｜tool▁outputs▁end｜>'}}
+        {%- endif %}
+        {%- set ns.is_first = false %}
+        {%- set ns.is_tool = false -%}
+        {%- set ns.is_output_first = true %}
+        {%- for tool in message['tool_calls'] %}
+            {%- if not ns.is_first %}
+                {%- if content is none %}
+                    {{'<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+                {%- else %}
+                    {{content + '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+                {%- endif %}
+                {%- set ns.is_first = true -%}
+            {%- else %}
+                {{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+            {%- endif %}
+        {%- endfor %}
+        {{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}
+    {%- endif %}
+    {%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none)%}
+        {%- set ns.is_last_user = false -%}
+        {%- if ns.is_tool %}
+            {{'<｜tool▁outputs▁end｜>' + content + '<｜end▁of▁sentence｜>'}}
+            {%- set ns.is_tool = false -%}
+        {%- else %}
+            {{content + '<｜end▁of▁sentence｜>'}}
+        {%- endif %}
+    {%- endif %}
+    {%- if message['role'] == 'tool' %}
+        {%- set ns.is_last_user = false -%}
+        {%- set ns.is_tool = true -%}
+        {%- if ns.is_output_first %}
+            {{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + content + '<｜tool▁output▁end｜>'}}
+            {%- set ns.is_output_first = false %}
+        {%- else %}
+            {{'\n<｜tool▁output▁begin｜>' + content + '<｜tool▁output▁end｜>'}}
+        {%- endif %}
+    {%- endif %}
+{%- endfor -%}
+{% if ns.is_tool %}
+    {{'<｜tool▁outputs▁end｜>'}}
+{% endif %}
+{% if add_generation_prompt and not ns.is_last_user and not ns.is_tool %}
+    {{'<｜Assistant｜>'}}
+{% endif %}
diff --git a/find_cuda_init.py b/find_cuda_init.py
index 0d13b2f862102718acf362f9df729c61bb277220..308fc6fc2d61c3c45122ba0c74aa115748d36b78 100644
--- a/find_cuda_init.py
+++ b/find_cuda_init.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import importlib
 import traceback
diff --git a/pyproject.toml b/pyproject.toml
index 62a734d795d5dd435df3811dc39e9f0a75e44c7c..307878f7e38d691cc5a8a3b61c1dfeecbb01cb6c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,14 +1,13 @@
 [build-system]
 # Should be mirrored in requirements/build.txt
 requires = [
-    "cmake>=3.26",
+    "cmake>=3.26.1",
     "ninja",
     "packaging>=24.2",
     "setuptools>=77.0.3,<80.0.0",
     "setuptools-scm>=8.0",
     "torch == 2.7.0",
     "wheel",
-    "regex",
     "jinja2",
 ]
 build-backend = "setuptools.build_meta"
@@ -110,6 +109,7 @@ ignore = [
 ]
 
 [tool.mypy]
+plugins = ['pydantic.mypy']
 ignore_missing_imports = true
 check_untyped_defs = true
 follow_imports = "silent"
@@ -171,7 +171,8 @@ plugins.md033.enabled = false # inline-html
 plugins.md046.enabled = false # code-block-style
 plugins.md024.allow_different_nesting = true # no-duplicate-headers
 
-[tool.ty]
+[tool.ty.src]
+root = "./vllm"
 respect-ignore-files = true
 
 [tool.ty.environment]
diff --git a/requirements/build.txt b/requirements/build.txt
index 320e5b89258433c30dacde4cba697a2ea1d45191..528cd3b538efd2e5042c6c23d229993a4524bd7b 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -1,5 +1,5 @@
 # Should be mirrored in pyproject.toml
-cmake>=3.26
+cmake>=3.26.1
 ninja
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
diff --git a/requirements/common.txt b/requirements/common.txt
index 625efc3366f48c7202ff31cc4ef9813b8e79a97c..871df3d21e74d135a88fb5f90a3b101a5448cbca 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -14,7 +14,7 @@ protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
 aiohttp
 openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
-pydantic >= 2.9
+pydantic >= 2.10
 prometheus_client >= 0.18.0
 pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.9.4 # required for compressed-tensors
+compressed-tensors == 0.10.1 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
diff --git a/requirements/cpu.txt b/requirements/cpu.txt
index 1213301584ce341a0bc5dbedede7571e9ed71f3d..d7b0fc6d80a7464e254dc254408a00fa0113a676 100644
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -1,6 +1,9 @@
 # Common dependencies
 -r common.txt
 
+numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
+numba == 0.61.2; python_version > '3.9'
+
 # Dependencies for CPUs
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
@@ -24,3 +27,5 @@ triton==3.2.0; platform_machine == "x86_64"
 # Intel Extension for PyTorch, only for x86_64 CPUs
 intel-openmp==2024.2.1; platform_machine == "x86_64"
 intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"
+py-libnuma; platform_system != "Darwin"
+psutil; platform_system != "Darwin"
diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt
index e9b466d3a82d62048a61d808bda3b058180d3f73..3475ada9f4c962c6cc199678435ff8e7b8e2c398 100644
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -9,7 +9,9 @@ pytest-shard
 pytest-timeout
 
 librosa # required by audio tests in entrypoints/openai
-sentence-transformers
+sentence-transformers # required for embedding tests
+transformers==4.51.3
+transformers_stream_generator # required for qwen-vl test
 numba == 0.61.2; python_version > '3.9'
 # testing utils
 boto3
@@ -38,4 +40,7 @@ matplotlib # required for qwen-vl test
 # required for  Multi-Modal Models Test (Standard)
 num2words # required for smolvlm test
 pqdm
-timm # required for internvl test
\ No newline at end of file
+timm # required for internvl test
+
+schemathesis>=3.39.15  # Required for openai schema test.
+mteb>=1.38.11, <2 # required for mteb test
diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt
index 981b90632c182b5ad67e22f189c46d96896fb52f..94201543cd4f30bc8a567ade430dcab73ab565cf 100644
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -7,7 +7,7 @@ torchvision==0.22.0
 torchaudio==2.7.0
 
 triton==3.2
-cmake>=3.26,<4
+cmake>=3.26.1,<4
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
diff --git a/requirements/rocm.txt b/requirements/rocm.txt
index 8a84f2ff1ed01c0bc01e8d84677a682bbf285a05..d33021fc7597e5b9bd433a2f2d04a1c20ba9ebcc 100644
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -12,5 +12,8 @@ ray>=2.10.0,<2.45.0
 peft
 pytest-asyncio
 tensorizer>=2.9.0
+packaging>=24.2
+setuptools>=77.0.3,<80.0.0
+setuptools-scm>=8
 runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0
diff --git a/requirements/test.in b/requirements/test.in
index 87af6176903887007cc710a60d32b6e0f52d9afe..bbbd41e168a60016964d75a345f9965b278b7bb5 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -17,7 +17,7 @@ vector_quantize_pytorch # required for minicpmo_26 test
 vocos # required for minicpmo_26 test
 peft
 pqdm
-ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests
+ray[cgraph,default]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
@@ -34,7 +34,7 @@ opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
-transformers==4.51.3
+transformers==4.52.4
 tokenizers==0.21.1
 huggingface-hub[hf_xet]>=0.30.0  # Required for Xet downloads.
 schemathesis>=3.39.15 # Required for openai schema test.
@@ -51,3 +51,4 @@ numpy
 runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0
 fastsafetensors>=0.1.10
+pydantic>=2.10 # 2.9 leads to error on python 3.10
\ No newline at end of file
diff --git a/requirements/test.txt b/requirements/test.txt
index 89d477017342ebaaa29b82123fe30584da28ae10..fb0eede080ff1dd2bcb60e6ecd0dd7e019cb629e 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -10,9 +10,13 @@ aiohappyeyeballs==2.4.3
     # via aiohttp
 aiohttp==3.10.11
     # via
+    #   aiohttp-cors
     #   datasets
     #   fsspec
     #   lm-eval
+    #   ray
+aiohttp-cors==0.8.1
+    # via ray
 aiosignal==1.3.1
     # via
     #   aiohttp
@@ -57,6 +61,8 @@ bounded-pool-executor==0.0.3
     # via pqdm
 buildkite-test-collector==0.1.9
     # via -r requirements/test.in
+cachetools==5.5.2
+    # via google-auth
 certifi==2024.8.30
     # via
     #   httpcore
@@ -81,6 +87,8 @@ colorama==0.4.6
     #   sacrebleu
     #   schemathesis
     #   tqdm-multiprocess
+colorful==0.5.6
+    # via ray
 contourpy==1.3.0
     # via matplotlib
 cramjam==2.9.0
@@ -108,6 +116,8 @@ dill==0.3.8
     #   evaluate
     #   lm-eval
     #   multiprocess
+distlib==0.3.9
+    # via virtualenv
 dnspython==2.7.0
     # via email-validator
 docopt==0.6.2
@@ -143,6 +153,7 @@ filelock==3.16.1
     #   ray
     #   torch
     #   transformers
+    #   virtualenv
 fonttools==4.54.1
     # via matplotlib
 fqdn==1.5.1
@@ -165,8 +176,16 @@ genai-perf==0.0.8
     # via -r requirements/test.in
 genson==1.3.0
     # via datamodel-code-generator
+google-api-core==2.24.2
+    # via opencensus
+google-auth==2.40.2
+    # via google-api-core
+googleapis-common-protos==1.70.0
+    # via google-api-core
 graphql-core==3.2.6
     # via hypothesis-graphql
+grpcio==1.71.0
+    # via ray
 h11==0.14.0
     # via httpcore
 harfile==0.3.0
@@ -392,6 +411,10 @@ nvidia-nvjitlink-cu12==12.8.61
     #   torch
 nvidia-nvtx-cu12==12.8.55
     # via torch
+opencensus==0.11.4
+    # via ray
+opencensus-context==0.1.3
+    # via opencensus
 opencv-python-headless==4.11.0.86
     # via
     #   -r requirements/test.in
@@ -445,6 +468,7 @@ platformdirs==4.3.6
     # via
     #   black
     #   pooch
+    #   virtualenv
 plotly==5.24.1
     # via genai-perf
 pluggy==1.5.0
@@ -457,10 +481,17 @@ portalocker==2.10.1
     # via sacrebleu
 pqdm==0.2.0
     # via -r requirements/test.in
+prometheus-client==0.22.0
+    # via ray
 propcache==0.2.0
     # via yarl
+proto-plus==1.26.1
+    # via google-api-core
 protobuf==5.28.3
     # via
+    #   google-api-core
+    #   googleapis-common-protos
+    #   proto-plus
     #   ray
     #   tensorizer
 psutil==6.1.0
@@ -470,22 +501,32 @@ psutil==6.1.0
     #   tensorizer
 py==1.11.0
     # via pytest-forked
+py-spy==0.4.0
+    # via ray
 pyarrow==18.0.0
     # via
     #   datasets
     #   genai-perf
+pyasn1==0.6.1
+    # via
+    #   pyasn1-modules
+    #   rsa
+pyasn1-modules==0.4.2
+    # via google-auth
 pybind11==2.13.6
     # via lm-eval
 pycparser==2.22
     # via cffi
 pycryptodomex==3.22.0
     # via blobfile
-pydantic==2.9.2
+pydantic==2.11.5
     # via
+    #   -r requirements/test.in
     #   datamodel-code-generator
     #   mistral-common
     #   mteb
-pydantic-core==2.23.4
+    #   ray
+pydantic-core==2.33.2
     # via pydantic
 pygments==2.18.0
     # via rich
@@ -572,6 +613,7 @@ requests==2.32.3
     #   buildkite-test-collector
     #   datasets
     #   evaluate
+    #   google-api-core
     #   huggingface-hub
     #   lm-eval
     #   mistral-common
@@ -600,6 +642,8 @@ rpds-py==0.20.1
     # via
     #   jsonschema
     #   referencing
+rsa==4.9.1
+    # via google-auth
 runai-model-streamer==0.11.0
     # via -r requirements/test.in
 runai-model-streamer-s3==0.11.0
@@ -647,9 +691,12 @@ shellingham==1.5.4
 six==1.16.0
     # via
     #   junit-xml
+    #   opencensus
     #   python-dateutil
     #   rfc3339-validator
     #   rouge-score
+smart-open==7.1.0
+    # via ray
 sniffio==1.3.1
     # via
     #   anyio
@@ -747,7 +794,7 @@ tqdm==4.66.6
     #   transformers
 tqdm-multiprocess==0.0.11
     # via lm-eval
-transformers==4.51.3
+transformers==4.52.4
     # via
     #   -r requirements/test.in
     #   genai-perf
@@ -784,6 +831,9 @@ typing-extensions==4.12.2
     #   pydantic-core
     #   torch
     #   typer
+    #   typing-inspection
+typing-inspection==0.4.1
+    # via pydantic
 tzdata==2024.2
     # via pandas
 uri-template==1.3.0
@@ -797,6 +847,8 @@ urllib3==2.2.3
     #   tritonclient
 vector-quantize-pytorch==1.21.2
     # via -r requirements/test.in
+virtualenv==20.31.2
+    # via ray
 vocos==0.1.0
     # via -r requirements/test.in
 webcolors==24.11.1
@@ -805,6 +857,8 @@ werkzeug==3.1.3
     # via schemathesis
 word2number==1.1
     # via lm-eval
+wrapt==1.17.2
+    # via smart-open
 xxhash==3.5.0
     # via
     #   datasets
diff --git a/requirements/tpu.txt b/requirements/tpu.txt
index 3b204a8f99056b299358913e9a5d9fe2601fac5b..a26dfd460d8efacf86abea088f8ace2e37dbfb11 100644
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -2,7 +2,7 @@
 -r common.txt
 
 # Dependencies for TPU
-cmake>=3.26
+cmake>=3.26.1
 packaging>=24.2
 setuptools-scm>=8
 wheel
@@ -18,9 +18,9 @@ setuptools==78.1.0
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.8.0.dev20250518
-torchvision==0.22.0.dev20250518
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch==2.8.0.dev20250605
+torchvision==0.23.0.dev20250605
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250605-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250605-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250605-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
 
diff --git a/requirements/xpu.txt b/requirements/xpu.txt
index 04c4d4ff85a0db4fdaa1aba650b9d73b4ac54394..3cb6a4a8addaccfb6087182fc3a569099385c295 100644
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -2,7 +2,7 @@
 -r common.txt
 
 ray>=2.9
-cmake>=3.26
+cmake>=3.26.1
 packaging>=24.2
 setuptools-scm>=8
 setuptools>=77.0.3,<80.0.0
diff --git a/setup.py b/setup.py
index 09005f54db5a6b18876074a19d01a7c007adfe46..67fbdf362591960cb047ca153ba9756708db8614 100644
--- a/setup.py
+++ b/setup.py
@@ -1,16 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import ctypes
 import importlib.util
 import json
 import logging
 import os
+import re
 import subprocess
 import sys
 from pathlib import Path
 from shutil import which
 
-import regex as re
 import torch
 from packaging.version import Version, parse
 from setuptools import Extension, setup
@@ -251,11 +252,8 @@ class cmake_build_ext(build_ext):
 
             # CMake appends the extension prefix to the install path,
             # and outdir already contains that prefix, so we need to remove it.
-            # We assume only the final component of extension prefix is added by
-            # CMake, this is currently true for current extensions but may not
-            # always be the case.
             prefix = outdir
-            if '.' in ext.name:
+            for _ in range(ext.name.count('.')):
                 prefix = prefix.parent
 
             # prefix here should actually be the same for all components
@@ -690,6 +688,7 @@ setup(
     ext_modules=ext_modules,
     install_requires=get_requirements(),
     extras_require={
+        "bench": ["pandas", "datasets"],
         "tensorizer": ["tensorizer>=2.9.0"],
         "fastsafetensors": ["fastsafetensors >= 0.1.10"],
         "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py
index 1e3c2d1a473a3356c6e7016b11539bf9a45bbc28..ec6b20f5e04b92f6537657851f660e26a51a3af6 100644
--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """vllm.entrypoints.api_server with some extra logging for testing."""
 from collections.abc import Iterable
 from typing import Any
@@ -7,6 +8,7 @@ import uvicorn
 from fastapi.responses import JSONResponse, Response
 
 import vllm.entrypoints.api_server
+import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.utils import FlexibleArgumentParser
@@ -45,9 +47,8 @@ if __name__ == "__main__":
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
     vllm.entrypoints.api_server.engine = engine
-    uvicorn.run(
-        app,
-        host=args.host,
-        port=args.port,
-        log_level="debug",
-        timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
+    uvicorn.run(app,
+                host=args.host,
+                port=args.port,
+                log_level="debug",
+                timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)
diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py
index 1a20e2c135c2ef40fb0feda5ae4e11e873eb8ac9..375b248ebedaaed26ed19e90f78373d3d528fcc2 100644
--- a/tests/async_engine/conftest.py
+++ b/tests/async_engine/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py
index 410cece795e94ad3d6dca41d862d412ef4997c0e..38ecaf2233d99adccbb2d6dd19462fd92e878b5a 100644
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import subprocess
diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py
index b6f44871497c8bcd9ccb243c395fde4e02d74cd8..043b75cc5d385d7cff5e09c9fc16236ac97e3642 100644
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import os
@@ -383,3 +384,25 @@ async def test_delayed_generator(async_engine, stop):
     assert final_output is not None
     assert len(final_output.outputs[0].token_ids) == 10
     assert final_output.finished
+
+
+@pytest.mark.asyncio(scope="module")
+async def test_invalid_argument(async_engine):
+    scheduler_config = await async_engine.get_scheduler_config()
+
+    if scheduler_config.num_scheduler_steps != 1:
+        pytest.skip("no need to test this one with multistep")
+
+    sampling_params = SamplingParams(
+        temperature=0,
+        min_tokens=10,
+        max_tokens=10,
+    )
+
+    # Targeting specific DP rank only supported in v1 multi-instance DP
+    with pytest.raises(ValueError):
+        async for _ in async_engine.generate("test",
+                                             sampling_params,
+                                             request_id=uid(),
+                                             data_parallel_rank=0):
+            pass
diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py
index fd6d89d4e00de91a21e67ed24dae207106e5c189..1851eeeda790582c2c993c25cff6bd2904f74661 100644
--- a/tests/async_engine/test_request_tracker.py
+++ b/tests/async_engine/test_request_tracker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 86b5e1e0ab7cf4bf9ed12b611f49089c611628b1..2e103019f7af62d5df1f4035325ade41ba9fac36 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the short outputs of HF and vLLM when using greedy sampling.
 
 Run `pytest tests/basic_correctness/test_basic_correctness.py`.
@@ -60,7 +61,6 @@ def _fix_prompt_embed_outputs(
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
-@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
@@ -69,7 +69,6 @@ def test_models(
     hf_runner,
     model: str,
     backend: str,
-    dtype: str,
     max_tokens: int,
     enforce_eager: bool,
     enable_prompt_embeds: bool,
@@ -97,7 +96,7 @@ def test_models(
             str(i) for i in range(1024)) + " are:"
         example_prompts = [prompt]
 
-        with hf_runner(model, dtype=dtype) as hf_model:
+        with hf_runner(model) as hf_model:
             hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
             if enable_prompt_embeds:
                 with torch.no_grad():
@@ -106,7 +105,6 @@ def test_models(
 
         with VllmRunner(model,
                         max_model_len=8192,
-                        dtype=dtype,
                         enforce_eager=enforce_eager,
                         enable_prompt_embeds=enable_prompt_embeds,
                         gpu_memory_utilization=0.7) as vllm_model:
@@ -130,15 +128,21 @@ def test_models(
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize(
     "model, distributed_executor_backend, attention_backend, "
-    "test_suite", [
-        ("distilbert/distilgpt2", "ray", "", "L4"),
-        ("distilbert/distilgpt2", "mp", "", "L4"),
-        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
-        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
-        ("distilbert/distilgpt2", "ray", "", "A100"),
-        ("distilbert/distilgpt2", "mp", "", "A100"),
-        ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
-        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+    "test_suite, extra_env", [
+        ("distilbert/distilgpt2", "ray", "", "L4", {}),
+        ("distilbert/distilgpt2", "mp", "", "L4", {}),
+        ("distilbert/distilgpt2", "ray", "", "L4", {
+            "VLLM_SLEEP_WHEN_IDLE": "1"
+        }),
+        ("distilbert/distilgpt2", "mp", "", "L4", {
+            "VLLM_SLEEP_WHEN_IDLE": "1"
+        }),
+        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
+        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
+        ("distilbert/distilgpt2", "ray", "", "A100", {}),
+        ("distilbert/distilgpt2", "mp", "", "A100", {}),
+        ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100", {}),
+        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100", {}),
     ])
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models_distributed(
@@ -150,6 +154,7 @@ def test_models_distributed(
     distributed_executor_backend: str,
     attention_backend: str,
     test_suite: str,
+    extra_env: dict[str, str],
     enable_prompt_embeds: bool,
 ) -> None:
 
@@ -175,6 +180,9 @@ def test_models_distributed(
                 attention_backend,
             )
 
+        for k, v in extra_env.items():
+            monkeypatch_context.setenv(k, v)
+
         dtype = "half"
         max_tokens = 5
 
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index 06c9e25ed8dd8a051ed9aebbd7cabc7aa9d3f4c7..eb5b09ff74f6063e2d311d29554575db723d20fe 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the outputs of HF and vLLM when using greedy sampling.
 
 It tests chunked prefill. Chunked prefill can be enabled by
diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py
index be3ad12396b4b1b00b8625cbda93e6107a00b42a..28bfe9e7c8020de5c3fd8a90747cd59880067a75 100644
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from ..utils import compare_two_settings
 
diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index 76b266aada68416caa37cd53b3d8def1c0f43e24..34f9389c82a9bb468d0d026a7ea35ad6ae134327 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py
index 63dc0f8c8e3b2f4cdf7572336749eb4bc1ce3048..341a39a42b85ec6973c1c281cd87f9ffd56ea8bc 100644
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the short outputs of HF and vLLM when using greedy sampling.
 
 VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
diff --git a/tests/benchmarks/test_latency_cli.py b/tests/benchmarks/test_latency_cli.py
index 8537459b9f94dea5417ae2e84e14b5b967c70765..2279c846e01cd5a206166b7a21df0745bd481f0a 100644
--- a/tests/benchmarks/test_latency_cli.py
+++ b/tests/benchmarks/test_latency_cli.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess
 
 import pytest
diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py
index b746d6b7853c9dbcff8cbb29886b291bcbeeb0c4..a3181952677fd1d975a7225e09c313815224ca23 100644
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess
 
 import pytest
diff --git a/tests/benchmarks/test_throughput_cli.py b/tests/benchmarks/test_throughput_cli.py
index 2045b362935658d381c5f65ab1bfc27b738108af..b61e51db4fbe46ef69f1975b3e238052ebe7c3b4 100644
--- a/tests/benchmarks/test_throughput_cli.py
+++ b/tests/benchmarks/test_throughput_cli.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess
 
 import pytest
diff --git a/tests/build_cython.py b/tests/build_cython.py
index 9dea6bcd62f3fd07a286547255acc3d92761c110..f4a334aa3b4842ef461d5c442d21bccfd5d5bff4 100644
--- a/tests/build_cython.py
+++ b/tests/build_cython.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import Cython.Compiler.Options
 from Cython.Build import cythonize
 from setuptools import setup
diff --git a/tests/compile/backend.py b/tests/compile/backend.py
index 5a02c4e2b378248a1614e9b69fa95f788ac7613f..60334f5e4f683151f53ba3225255e88ea7c0ccd9 100644
--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import deepcopy
 from typing import Callable, Union
diff --git a/tests/compile/conftest.py b/tests/compile/conftest.py
deleted file mode 100644
index 7118810a58614ba746ee3dee7807e3aea11b9c64..0000000000000000000000000000000000000000
--- a/tests/compile/conftest.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-
-
-# TEST V1: this should be removed. Right now V1 overrides
-# all the torch compile logic. We should re-enable this
-# as we add torch compile support back to V1.
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    Since this module is V0 only, set VLLM_USE_V1=0 for
-    all tests in the module.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py
index a71a40cda73ea07634961b31d6a1a4e83dde5a88..134bade486079bbac5aed469e3f8b276d9f20cde 100644
--- a/tests/compile/piecewise/test_full_cudagraph.py
+++ b/tests/compile/piecewise/test_full_cudagraph.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
 import os
 
@@ -6,6 +7,7 @@ import pytest
 
 from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig
+from vllm.platforms import current_platform
 
 MODEL = "Qwen/Qwen2-1.5B-Instruct"
 
@@ -36,7 +38,7 @@ def full_cudagraph_llm():
             "VLLM_FLASH_ATTN_VERSION": "3"
     }):
         return LLM(model=MODEL,
-                   gpu_memory_utilization=0.2,
+                   gpu_memory_utilization=0.3,
                    compilation_config=CompilationConfig(full_cuda_graph=True))
 
 
@@ -47,7 +49,7 @@ def piecewise_llm():
             "VLLM_FLASH_ATTN_VERSION": "3"
     }):
         return LLM(model=MODEL,
-                   gpu_memory_utilization=0.5,
+                   gpu_memory_utilization=0.6,
                    compilation_config=CompilationConfig())
 
 
@@ -60,6 +62,8 @@ def generate_text(llm: LLM, batch_size: int, max_tokens: int):
     return llm.generate(prompts, sampling_params)
 
 
+@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0),
+                    reason="Only Hopper GPUs support FlashAttention 3")
 @pytest.mark.parametrize(("batch_size", "max_tokens"), [(1, 10), (7, 10),
                                                         (16, 10), (25, 10),
                                                         (32, 10), (45, 10),
diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index 143cb49697f5b270ee1f8cae91d47e82d4519734..9633f139873c48dca8f47b7e345ea68e3ebd68e5 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Test the piecewise compilation with a simple model so that we
 can exactly calculate the expected output and side effects.
@@ -12,6 +13,7 @@ from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
                          set_current_vllm_config)
+from vllm.envs import VLLM_USE_V1
 from vllm.utils import direct_register_custom_op
 
 global_counter = 0
@@ -74,11 +76,13 @@ class SillyModel(nn.Module):
         return x
 
 
-def test_simple_piecewise_compile():
+def _test_simple_piecewise_compile(*, use_inductor):
+    assert VLLM_USE_V1
 
     vllm_config = VllmConfig(compilation_config=CompilationConfig(
         level=CompilationLevel.PIECEWISE,
         use_cudagraph=True,
+        use_inductor=use_inductor,
         splitting_ops=["silly.attention"],
         cudagraph_copy_inputs=True,
         cudagraph_capture_sizes=[1, 2],
@@ -93,7 +97,7 @@ def test_simple_piecewise_compile():
             num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
             num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
             num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_caputured=
+            num_cudagraph_captured=
             6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
     ):
 
@@ -108,3 +112,11 @@ def test_simple_piecewise_compile():
         output = model(input)
         assert global_counter == 2
         assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))
+
+
+def test_simple_piecewise_compile_inductor():
+    _test_simple_piecewise_compile(use_inductor=True)
+
+
+def test_simple_piecewise_compile_no_inductor():
+    _test_simple_piecewise_compile(use_inductor=False)
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index d4551b1cc3aec886cb1c980b08b49da979cd74bd..410c0101c99b9f4a619791fd108443c3bc190460 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Test the piecewise compilation with a simple model, comparing the output
 with and without the piecewise compilation.
@@ -261,12 +262,14 @@ def tractable_computation(input_ids: torch.Tensor,
 @torch.inference_mode
 def run_model(llama_config,
               use_compile: bool,
+              use_inductor: bool,
               split_attn: bool = False) -> torch.Tensor:
 
     if use_compile:
         compilation_config = CompilationConfig(
             level=CompilationLevel.PIECEWISE,
             use_cudagraph=True,
+            use_inductor=use_inductor,
             cudagraph_capture_sizes=[1, 2],
         )
         if split_attn:
@@ -304,7 +307,7 @@ def run_model(llama_config,
         return output.cpu()
 
 
-def test_toy_llama():
+def _test_toy_llama(*, use_inductor):
     # compare output with and without piecewise compilation
 
     llama_config = LlamaConfig(hidden_size=128,
@@ -324,21 +327,31 @@ def test_toy_llama():
             num_piecewise_graphs_seen=0,
             num_piecewise_capturable_graphs_seen=0,
             num_backend_compilations=0,
-            num_cudagraph_caputured=0,
+            num_cudagraph_captured=0,
     ):
-        outputs.append(run_model(llama_config, use_compile=False))
-    run_model(tractable_config, use_compile=False)
+        outputs.append(
+            run_model(llama_config, use_inductor=False, use_compile=False))
+    run_model(tractable_config, use_inductor=False, use_compile=False)
+
+    if use_inductor:
+        kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
+    else:
+        kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
 
     with compilation_counter.expect(
             num_graphs_seen=1,  # one graph for the model
             num_piecewise_graphs_seen=1,
             num_piecewise_capturable_graphs_seen=1,
             num_backend_compilations=1,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_caputured=
+            num_cudagraph_captured=
             2,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+            **kwargs,
     ):
-        outputs.append(run_model(llama_config, use_compile=True))
-    run_model(tractable_config, use_compile=True)
+        outputs.append(
+            run_model(llama_config,
+                      use_inductor=use_inductor,
+                      use_compile=True))
+    run_model(tractable_config, use_inductor=use_inductor, use_compile=True)
 
     with compilation_counter.expect(
             num_graphs_seen=1,  # one graph for the model
@@ -348,18 +361,32 @@ def test_toy_llama():
             llama_config.num_layers,  # 1 + num_layers
             num_backend_compilations=1 +
             llama_config.num_layers,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_caputured=2 *
+            num_cudagraph_captured=2 *
         (1 + llama_config.num_layers
          ),  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
     ):
         outputs.append(
-            run_model(llama_config, use_compile=True, split_attn=True))
-    run_model(tractable_config, use_compile=True, split_attn=True)
+            run_model(llama_config,
+                      use_inductor=use_inductor,
+                      use_compile=True,
+                      split_attn=True))
+    run_model(tractable_config,
+              use_inductor=use_inductor,
+              use_compile=True,
+              split_attn=True)
 
     for i in range(1, len(outputs)):
         assert torch.allclose(outputs[0], outputs[i])
 
 
+def test_toy_llama_inductor():
+    _test_toy_llama(use_inductor=True)
+
+
+def test_toy_no_inductor():
+    _test_toy_llama(use_inductor=False)
+
+
 @torch.inference_mode
 def benchmark():
     from triton.testing import do_bench
diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py
index 8e4e0ba8357931a5ab8f48357fa1c638d29c56f6..1e4ee571f1af5b78ecc52ebe3142d06d407cd819 100644
--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/test_async_tp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index b6b45d1cbe880560d74bd2a4894c1c039cbc74c4..dc6cfe9daccdc45d0dbf04ec1ac86b53ef454a02 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 import dataclasses
diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..57fd1f1c49b12b1eedb52117a5063e570f0b2c0b
--- /dev/null
+++ b/tests/compile/test_config.py
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+import vllm
+from vllm.compilation.counter import compilation_counter
+from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
+                         set_current_vllm_config)
+
+from .piecewise.test_simple import SillyModel
+
+
+@pytest.mark.parametrize("enabled", [True, False])
+def test_use_cudagraphs(enabled):
+    assert vllm.envs.VLLM_USE_V1
+    vllm_config = VllmConfig(compilation_config=CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        use_cudagraph=enabled,
+        cudagraph_capture_sizes=[100],
+    ))
+    with set_current_vllm_config(vllm_config):
+        model = SillyModel(vllm_config=vllm_config, prefix='')
+
+    inputs = torch.randn(100, device="cuda")
+
+    with compilation_counter.expect(
+            num_graphs_seen=1,  # one graph for the model
+            num_cudagraph_captured=1 if enabled else 0,
+    ):
+        # first run is warmup
+        model(inputs)
+        # second run does CUDAGraphs recording (if enabled)
+        model(inputs)
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 397517b8665bc3f44342c475c8ddb6cfd549f532..1d000fe00c598064b7f59975d77fc1d48102a715 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py
index 5d38ff91490ee10d35ad16ef757786e06b20ec5f..aade29b99de7e036bcfcc67657dfae05d629fd05 100644
--- a/tests/compile/test_functionalization.py
+++ b/tests/compile/test_functionalization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
index 509593e7328deb2de7a38a31c1dc5eb3d4d5f455..0c25aae52d465d658afdd2b3e1ce82a40f9823a8 100644
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py
index b630d0e85d31ac7cc9c0fe0edb6d92e54429e79c..251cc46e9e9899bb0c28588a56f99998cba6a503 100644
--- a/tests/compile/test_pass_manager.py
+++ b/tests/compile/test_pass_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 
 import pytest
diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py
index 2cd7ebaacec0087f8423fea8a486414757aa11af..c689befdf2da6675da20e869b229c224f1735b39 100644
--- a/tests/compile/test_sequence_parallelism.py
+++ b/tests/compile/test_sequence_parallelism.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py
index 9eae48d60f368c14a063f60eeb5ec0335cd3f20b..df36b86abdbe4b993a2bdc9e4aff0efdd5d3fdc1 100644
--- a/tests/compile/test_silu_mul_quant_fusion.py
+++ b/tests/compile/test_silu_mul_quant_fusion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py
index 0934c61135792f8f6b0bff187a69ba6916965937..5e39f6821d16ca69c7303c6fe713db041c3a2e83 100644
--- a/tests/compile/test_wrapper.py
+++ b/tests/compile/test_wrapper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 19c2c62471295cd1c8ae155878c22bb72fe0a1c9..5ec3926bd31f48a26bec10d02a0b75c93b86d4b9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import os
 import tempfile
@@ -311,6 +312,7 @@ class HfRunner:
         dtype: str = "auto",
         *,
         model_kwargs: Optional[dict[str, Any]] = None,
+        trust_remote_code: bool = True,
         is_sentence_transformer: bool = False,
         is_cross_encoder: bool = False,
         skip_tokenizer_init: bool = False,
@@ -320,10 +322,15 @@ class HfRunner:
 
         self.config = AutoConfig.from_pretrained(
             model_name,
-            trust_remote_code=True,
+            trust_remote_code=trust_remote_code,
         )
         self.device = self.get_default_device()
-        self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
+        self.dtype = torch_dtype = _get_and_verify_dtype(
+            self.model_name,
+            self.config,
+            dtype=dtype,
+            is_pooling_model=is_sentence_transformer or is_cross_encoder,
+        )
 
         model_kwargs = model_kwargs if model_kwargs is not None else {}
         model_kwargs.setdefault("torch_dtype", torch_dtype)
@@ -336,7 +343,7 @@ class HfRunner:
                 model_name,
                 device=self.device,
                 model_kwargs=model_kwargs,
-                trust_remote_code=True,
+                trust_remote_code=trust_remote_code,
             )
         elif is_cross_encoder:
             # Lazy init required for AMD CI
@@ -346,12 +353,12 @@ class HfRunner:
                 model_name,
                 device=self.device,
                 automodel_args=model_kwargs,
-                trust_remote_code=True,
+                trust_remote_code=trust_remote_code,
             )
         else:
             model = auto_cls.from_pretrained(
                 model_name,
-                trust_remote_code=True,
+                trust_remote_code=trust_remote_code,
                 **model_kwargs,
             )
 
@@ -372,7 +379,7 @@ class HfRunner:
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
                 torch_dtype=torch_dtype,
-                trust_remote_code=True,
+                trust_remote_code=trust_remote_code,
             )
 
         # don't put this import at the top level
@@ -381,7 +388,7 @@ class HfRunner:
         self.processor = AutoProcessor.from_pretrained(
             model_name,
             torch_dtype=torch_dtype,
-            trust_remote_code=True,
+            trust_remote_code=trust_remote_code,
         )
         if skip_tokenizer_init:
             self.tokenizer = self.processor.tokenizer
diff --git a/tests/core/block/conftest.py b/tests/core/block/conftest.py
index b7a9863f4aaf58945e90d7fe70dde805f1666abf..6afe98d78ce815379120532dd6635a320ca9d1b1 100644
--- a/tests/core/block/conftest.py
+++ b/tests/core/block/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py
index 83259b690337ace1db2445afcc027d6c3febbeca..e2c6c66b259c8e943ddab1e9569f55f18c90b474 100644
--- a/tests/core/block/e2e/conftest.py
+++ b/tests/core/block/e2e/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Callable, Optional
diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py
index 9e8e315d87b188229fa70bbe5b26131d8f920c0f..f296c81e176853b02a43a82ed1f19fc01b4f29af 100644
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import cycle
 
diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py
index 039b5e739892aae30822301e86f0a561adf40af2..3429a858dda59e90c0df1ea47d4778aa5e857cb9 100644
--- a/tests/core/block/e2e/test_correctness_sliding_window.py
+++ b/tests/core/block/e2e/test_correctness_sliding_window.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 
diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py
index 68d9618ae245be712afeb996deaf1385c05d7c6e..9eed264fd7d43ec7bea29f2e3cd95dd8d18bf12d 100644
--- a/tests/core/block/test_block_manager.py
+++ b/tests/core/block/test_block_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py
index 250c9a7497d23aad27dbd9601283936d1428ce8b..ba085001136be7df01ec09025168f302e1b5a8b0 100644
--- a/tests/core/block/test_block_table.py
+++ b/tests/core/block/test_block_table.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/core/block/test_common.py b/tests/core/block/test_common.py
index 20260873003df3ae939d8f4c7e85798f89cb0b27..65400899b811cfcabd88b2e2caa755b29d4d7e73 100644
--- a/tests/core/block/test_common.py
+++ b/tests/core/block/test_common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 
diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py
index a1414edd95622c2acca84cc400e2d64ca17157a0..795eef6743fd1a45bad0f32e291be8779810585e 100644
--- a/tests/core/block/test_cpu_gpu_block_allocator.py
+++ b/tests/core/block/test_cpu_gpu_block_allocator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py
index 4b9454c84ff658f6e03648bebf380420a9ee8934..a31d1c46b37f0d4ab7144abaf744fbefd8a63771 100644
--- a/tests/core/block/test_naive_block.py
+++ b/tests/core/block/test_naive_block.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py
index 50233624f7d17d514688a35229400e7268349fe2..46e224c6f53b2e2a96820bf0bfe80667a1fd62ca 100644
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 import random
diff --git a/tests/core/conftest.py b/tests/core/conftest.py
index 1a20e2c135c2ef40fb0feda5ae4e11e873eb8ac9..375b248ebedaaed26ed19e90f78373d3d528fcc2 100644
--- a/tests/core/conftest.py
+++ b/tests/core/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index 161b32f01b111c3d608b06bf219e3d7d9a7fe8e0..d4dacc4f1296da2b126fba09d103f79bd65494dc 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import MagicMock
 
diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py
index a4a90144482bb55e260998f31414b837fe257a9c..1b958e34df870a1fcc8edb9912c4453806dc4308 100644
--- a/tests/core/test_num_computed_tokens_update.py
+++ b/tests/core/test_num_computed_tokens_update.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index a5ba16898d89118c9119ac572a25afb762738620..db78a9d55642264755f0d454b8d8adcd65106b37 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from collections import deque
diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py
index c6049b26a2bcdc88fb10a811458517d5ab57fe74..20cc083ec8db47173835c3070cd52f487fbd0c74 100644
--- a/tests/core/test_scheduler_encoder_decoder.py
+++ b/tests/core/test_scheduler_encoder_decoder.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest  # noqa
 
diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py
index 64b3e148ee728c08da3479e850cdf20e79a8fe08..8281298d6634c9dde9636977f20588a74ca673d0 100644
--- a/tests/core/test_serialization.py
+++ b/tests/core/test_serialization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import msgspec
 
diff --git a/tests/core/utils.py b/tests/core/utils.py
index 84b0426b470bc9cd012da02bb0db11126773154c..b746c17864641331477281e87c9ebef6bddf5d0a 100644
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from collections import defaultdict
diff --git a/tests/detokenizer/conftest.py b/tests/detokenizer/conftest.py
index 59394b0351bda38a871de9db669618b798e6857d..f2c125355c83c62a9e3bccabd3d7466a7d94abdc 100644
--- a/tests/detokenizer/conftest.py
+++ b/tests/detokenizer/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/detokenizer/test_disable_detokenization.py b/tests/detokenizer/test_disable_detokenization.py
index 14f9babb8d8a6bf188322589b838b9e3c313bae1..ae06a985c7ecdc86cf4a9ca895b5e4cff89fb5d8 100644
--- a/tests/detokenizer/test_disable_detokenization.py
+++ b/tests/detokenizer/test_disable_detokenization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/detokenizer/test_stop_checker.py b/tests/detokenizer/test_stop_checker.py
index e9ad8d1612102b85ad7160a524363493929d3063..bd221977224f90e071effca62f83288d11cb5c22 100644
--- a/tests/detokenizer/test_stop_checker.py
+++ b/tests/detokenizer/test_stop_checker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import MagicMock
 
diff --git a/tests/detokenizer/test_stop_reason.py b/tests/detokenizer/test_stop_reason.py
index 4b1e4f5cf45e82e73830f2ca8080e08a20747e8c..9716f7d72a58522b833258c3eddd79fbb8858570 100644
--- a/tests/detokenizer/test_stop_reason.py
+++ b/tests/detokenizer/test_stop_reason.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test the different finish_reason="stop" situations during generation:
     1. One of the provided stop strings
     2. One of the provided stop tokens
diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py
index 0607dd01a339519acf4b2b8b8009717b07dd822c..efe938a20c4f45971bce83850652133e074423d0 100644
--- a/tests/detokenizer/test_stop_strings.py
+++ b/tests/detokenizer/test_stop_strings.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py
index ee8f2097933d1dbdbb5b7b0942a04f3faf74b5d6..666a715cc0da114589128000c9c49718df87668f 100644
--- a/tests/distributed/conftest.py
+++ b/tests/distributed/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
 from typing import Optional, Union
 
@@ -12,11 +13,13 @@ from vllm.distributed.kv_events import EventPublisherFactory
 
 from .test_events import SampleBatch
 
+DP_RANK = 0
+
 
 @pytest.fixture
 def random_port():
     """Generate a random port number for testing"""
-    return random.randint(10000, 60000)
+    return random.randint(10000, 59900)
 
 
 @pytest.fixture
@@ -29,21 +32,23 @@ def publisher_config(random_port, request):
         replay_endpoint = endpoint + "-replay"
     else:
         endpoint = f"tcp://*:{random_port}"
-        replay_endpoint = f"tcp://*:{random_port + 1}"
+        replay_endpoint = f"tcp://*:{random_port + 100}"
 
-    return KVEventsConfig(enable_kv_cache_events=True,
-                          publisher="zmq",
-                          endpoint=endpoint,
-                          replay_endpoint=replay_endpoint,
-                          buffer_steps=100,
-                          hwm=1000,
-                          topic="test")
+    return KVEventsConfig(
+        enable_kv_cache_events=True,
+        publisher="zmq",
+        endpoint=endpoint,
+        replay_endpoint=replay_endpoint,
+        buffer_steps=100,
+        hwm=1000,
+        topic="test",
+    )
 
 
 @pytest.fixture
 def publisher(publisher_config):
     """Create and return a publisher instance"""
-    pub = EventPublisherFactory.create(publisher_config)
+    pub = EventPublisherFactory.create(publisher_config, DP_RANK)
     yield pub
     pub.shutdown()
 
@@ -59,7 +64,11 @@ def subscriber(publisher_config):
     if replay_endpoint and replay_endpoint.startswith("tcp://*"):
         replay_endpoint = replay_endpoint.replace("*", "127.0.0.1")
 
-    sub = MockSubscriber(endpoint, replay_endpoint, publisher_config.topic)
+    sub = MockSubscriber(
+        [endpoint],
+        [replay_endpoint] if replay_endpoint else None,
+        publisher_config.topic,
+    )
     yield sub
     sub.close()
 
@@ -67,26 +76,37 @@ def subscriber(publisher_config):
 class MockSubscriber:
     """Helper class to receive and verify published events"""
 
-    def __init__(self,
-                 pub_endpoint: str,
-                 replay_endpoint: Optional[str] = None,
-                 topic: str = "",
-                 decode_type=SampleBatch):
+    def __init__(
+        self,
+        pub_endpoints: Union[str, list[str]],
+        replay_endpoints: Optional[Union[str, list[str]]] = None,
+        topic: str = "",
+        decode_type=SampleBatch,
+    ):
         self.ctx = zmq.Context.instance()
 
-        # Set up subscriber socket
-        self.sub = self.ctx.socket(zmq.SUB)
-        self.sub.setsockopt(zmq.SUBSCRIBE, topic.encode('utf-8'))
-        self.sub.connect(pub_endpoint)
+        # Convert single endpoint to list for consistency
+        if isinstance(pub_endpoints, str):
+            pub_endpoints = [pub_endpoints]
+        if isinstance(replay_endpoints, str):
+            replay_endpoints = [replay_endpoints]
 
-        # Set up replay socket if provided
-        self.replay = None
-        if replay_endpoint:
-            self.replay = self.ctx.socket(zmq.REQ)
-            self.replay.connect(replay_endpoint)
+        # Set up subscriber socket - connect to all endpoints
+        self.sub = self.ctx.socket(zmq.SUB)
+        self.sub.setsockopt(zmq.SUBSCRIBE, topic.encode("utf-8"))
+        for endpoint in pub_endpoints:
+            self.sub.connect(endpoint)
+
+        # Set up replay sockets if provided
+        self.replay_sockets = []
+        if replay_endpoints:
+            for replay_endpoint in replay_endpoints:
+                replay = self.ctx.socket(zmq.REQ)
+                replay.connect(replay_endpoint)
+                self.replay_sockets.append(replay)
 
         self.topic = topic
-        self.topic_bytes = topic.encode('utf-8')
+        self.topic_bytes = topic.encode("utf-8")
         self.received_msgs: list[tuple[int, SampleBatch]] = []
         self.last_seq = -1
         self.decoder = msgspec.msgpack.Decoder(type=decode_type)
@@ -106,25 +126,31 @@ class MockSubscriber:
         self.received_msgs.append((seq, data))
         return seq, data
 
-    def request_replay(self, start_seq: int) -> None:
+    def request_replay(self, start_seq: int, socket_idx: int = 0) -> None:
         """Request replay of messages starting from start_seq"""
-        if not self.replay:
-            raise ValueError("Replay socket not initialized")
-
-        self.replay.send(start_seq.to_bytes(8, "big"))
-
-    def receive_replay(self) -> list[tuple[int, SampleBatch]]:
-        """Receive replayed messages"""
-        if not self.replay:
-            raise ValueError("Replay socket not initialized")
-
+        if not self.replay_sockets:
+            raise ValueError("Replay sockets not initialized")
+        if socket_idx >= len(self.replay_sockets):
+            raise ValueError(f"Invalid socket index {socket_idx}")
+
+        self.replay_sockets[socket_idx].send(start_seq.to_bytes(8, "big"))
+
+    def receive_replay(self,
+                       socket_idx: int = 0) -> list[tuple[int, SampleBatch]]:
+        """Receive replayed messages from a specific replay socket"""
+        if not self.replay_sockets:
+            raise ValueError("Replay sockets not initialized")
+        if socket_idx >= len(self.replay_sockets):
+            raise ValueError(f"Invalid socket index {socket_idx}")
+
+        replay_socket = self.replay_sockets[socket_idx]
         replayed: list[tuple[int, SampleBatch]] = []
         while True:
             try:
-                if not self.replay.poll(1000):
+                if not replay_socket.poll(1000):
                     break
 
-                frames = self.replay.recv_multipart()
+                frames = replay_socket.recv_multipart()
                 if not frames or not frames[-1]:
                     # End of replay marker
                     break
@@ -141,5 +167,5 @@ class MockSubscriber:
     def close(self):
         """Clean up resources"""
         self.sub.close()
-        if self.replay:
-            self.replay.close()
+        for replay in self.replay_sockets:
+            replay.close()
diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py
index 72e7ebdb7b59478b8d23f9cae6b01d436e8ca819..e2de462612b47e8b1b27402d991ac0d3c6850d4e 100644
--- a/tests/distributed/test_ca_buffer_sharing.py
+++ b/tests/distributed/test_ca_buffer_sharing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # can only run on machines with p2p access across GPUs
 # can only run with torchrun:
diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
index 8f4c3537e1586d098c8c0d872d2e09d16de0a3a5..e2cb579e22dc4167852af6b52a9811fb229c67c6 100644
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test the communication operators.
 
 Run `pytest tests/distributed/test_comm_ops.py`.
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index a7ba45c9e546ea5b0b7755fdb3a62a9698c0c769..fae49c41d5f83a06eaeb4011618ce1f1b2a25e1e 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 
diff --git a/tests/distributed/test_distributed_oot.py b/tests/distributed/test_distributed_oot.py
index 4b0c65d1d3a47fa66c65bd3604a9dd8ca510a0e8..b93696e4be0e1ed1405e971dfabc250061fcaf98 100644
--- a/tests/distributed/test_distributed_oot.py
+++ b/tests/distributed/test_distributed_oot.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from ..entrypoints.openai.test_oot_registration import (
     run_and_test_dummy_opt_api_server)
diff --git a/tests/distributed/test_events.py b/tests/distributed/test_events.py
index 8de1aa20eabd01d8a588a55cd6fe158ebfc1a815..8be9ee0a1889dab1f60a90245bc285b6f83d5044 100644
--- a/tests/distributed/test_events.py
+++ b/tests/distributed/test_events.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import threading
 import time
 
@@ -8,6 +9,8 @@ import pytest
 from vllm.distributed.kv_events import (EventBatch, EventPublisherFactory,
                                         NullEventPublisher)
 
+DP_RANK = 0
+
 
 class EventSample(
         msgspec.Struct,
@@ -120,7 +123,7 @@ def test_topic_filtering(publisher_config):
     publisher_config.replay_endpoint = None
 
     publisher_config.topic = "foo"
-    pub = EventPublisherFactory.create(publisher_config)
+    pub = EventPublisherFactory.create(publisher_config, DP_RANK)
 
     from .conftest import MockSubscriber
     sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo")
@@ -184,9 +187,72 @@ def test_high_volume(publisher, subscriber):
 
 def test_null_publisher():
     """Test that NullEventPublisher can be used without errors"""
-    publisher = NullEventPublisher()
+    publisher = NullEventPublisher(DP_RANK)
 
     # This should not raise any errors
     batch = create_test_events(5)
     publisher.publish(batch)
     publisher.shutdown()
+
+
+def test_data_parallel_rank_tagging(publisher_config):
+    """Test that events are properly tagged with their data parallel rank"""
+
+    publisher_config.topic = "foo"
+    pub_0 = EventPublisherFactory.create(publisher_config, DP_RANK)
+    pub_1 = EventPublisherFactory.create(publisher_config, DP_RANK + 1)
+
+    # Hardcode the expected endpoints based on port offsetting behavior
+    # Both ranks get offsets according to _offset_endpoint_port function
+    base_endpoint = publisher_config.endpoint
+    if "tcp://" in base_endpoint:
+        # For TCP endpoints: tcp://localhost:5557 -> tcp://localhost:5557, tcp://localhost:5558
+        expected_endpoint_0 = base_endpoint  # rank 0 gets port + 0 = same port
+        expected_endpoint_1 = base_endpoint.replace(
+            ":5557", ":5558")  # rank 1 gets port + 1
+    else:
+        # For inproc endpoints: inproc://test -> inproc://test_dp0, inproc://test_dp1
+        expected_endpoint_0 = base_endpoint  # rank 0 gets base
+        expected_endpoint_1 = base_endpoint + "_dp1"  # rank 1 gets _dp1
+
+    from .conftest import MockSubscriber
+    sub_0 = MockSubscriber(expected_endpoint_0, None, publisher_config.topic)
+    sub_1 = MockSubscriber(expected_endpoint_1, None, publisher_config.topic)
+
+    try:
+        time.sleep(0.1)  # Let publishers start up
+
+        # Publish events from different ranks
+        batch_0 = create_test_events(2)
+        batch_1 = create_test_events(3)
+
+        pub_0.publish(batch_0)
+        pub_1.publish(batch_1)
+
+        # Receive events from rank 0
+        result_0 = sub_0.receive_one(timeout=200)
+        assert result_0 is not None, "No message received from rank 0"
+        seq_0, received_0 = result_0
+
+        # Receive events from rank 1
+        result_1 = sub_1.receive_one(timeout=200)
+        assert result_1 is not None, "No message received from rank 1"
+        seq_1, received_1 = result_1
+
+        # Verify DP rank tagging
+        assert received_0.data_parallel_rank == 0, (
+            f"Expected DP rank 0, got {received_0.data_parallel_rank}")
+        assert received_1.data_parallel_rank == 1, (
+            f"Expected DP rank 1, got {received_1.data_parallel_rank}")
+
+        # Verify event content is correct
+        assert len(
+            received_0.events) == 2, "Wrong number of events from rank 0"
+        assert len(
+            received_1.events) == 3, "Wrong number of events from rank 1"
+
+    finally:
+        pub_0.shutdown()
+        pub_1.shutdown()
+        sub_0.close()
+        sub_1.close()
diff --git a/tests/distributed/test_expert_parallel.py b/tests/distributed/test_expert_parallel.py
index db82816178030a39250effc4320c7e9bb85c8bb9..f641bf1604145bf9b23ff6e693304429d06807a5 100644
--- a/tests/distributed/test_expert_parallel.py
+++ b/tests/distributed/test_expert_parallel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Literal, NamedTuple, Optional
diff --git a/tests/distributed/test_multi_node_assignment.py b/tests/distributed/test_multi_node_assignment.py
index c86d2d8a0061a415717a2eeae358f86ac93b4bf8..ef17a51fff0e1f8367cf7ad736c4b97ec1557ca9 100644
--- a/tests/distributed/test_multi_node_assignment.py
+++ b/tests/distributed/test_multi_node_assignment.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Make sure ray assigns GPU workers to the correct node.
 
 Run:
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 5346d67b10d16d7cc947239accb07250573be2db..7d569fd83821d854bab6f4e4390b45cc50d32746 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 WARNING: This test runs in both single-node (4 GPUs) and multi-node
  (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
@@ -227,6 +228,7 @@ MULTIMODAL_MODELS = {
     "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
     "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
     "allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
+    "AIDC-AI/Ovis2-1B": PPTestSettings.fast(),
     "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
     "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
     "Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py
index 7bf93f270148ba4da69a25a8a36b23aaaf0099d5..69ceedd345a89d967c450051da3e8f8cdb0940a5 100644
--- a/tests/distributed/test_pipeline_partition.py
+++ b/tests/distributed/test_pipeline_partition.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py
index 3ca6e7b33a5eeff9e00101cdb428bd9e283d842c..a027a9e37dd6770db3435a7eaf799317511c5f98 100644
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index 2c323edfa2af2d9879c88b7d58f2249e7487e583..5b32b90f3cfec3b9f8e33d882b43d09483ea608e 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import multiprocessing
 import os
diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py
index 9b1bbd6e545c1b505a1a2db6ff0394afb345e591..94ad8f4f1213a9ac1ca2a4e1a01f7b2324b0d36b 100644
--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py
index c9eba2b43788ecbc4c91f9b66a51dc5af2798277..91a594eac5c4257bd32abdfbac5ed9e54aefd75c 100644
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 WARNING: This test runs in both single-node (4 GPUs) and multi-node
  (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py
index f9eacc11d75f80cd7de1bea1e1c85b3dcac517d2..e1357b4a34e99b7c3449fea15bca247799a5de4a 100644
--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import multiprocessing
 import random
diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py
index bb38e908b7345bcfa791a105d2e246ae7b7294f4..9f2c3eaec3597978a12f833c9c889af57234fb5f 100644
--- a/tests/distributed/test_torchrun_example.py
+++ b/tests/distributed/test_torchrun_example.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # unit test for `examples/offline_inference/torchrun_example.py`
 import os
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
index 4432950f274e023cb5173bc4ca016785f0850c4d..0287ad94e388624fc811fcfb4d85f93bdb1f5bda 100644
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import socket
 
diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py
index 0f46fba3ac49fc34e7841f168c124cebd6a5f22c..8b99d9d6e21fbe1eec439c8830c9753753cbf00f 100644
--- a/tests/encoder_decoder/test_e2e_correctness.py
+++ b/tests/encoder_decoder/test_e2e_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """E2E tests to verify the correctness of the encoder-decoder framework
 
 Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
diff --git a/tests/engine/conftest.py b/tests/engine/conftest.py
index 1a20e2c135c2ef40fb0feda5ae4e11e873eb8ac9..375b248ebedaaed26ed19e90f78373d3d528fcc2 100644
--- a/tests/engine/conftest.py
+++ b/tests/engine/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 05d9cfc7ab747191b3413892439d9991c905b724..cfbc7c245ffd41db2b4716837647d719c47a1af9 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -1,17 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from argparse import ArgumentError, ArgumentTypeError
 from contextlib import nullcontext
 from dataclasses import dataclass, field
-from typing import Literal, Optional
+from typing import Annotated, Literal, Optional
 
 import pytest
 
 from vllm.config import CompilationConfig, config
 from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs,
-                                   get_type, is_not_builtin, is_type,
-                                   literal_to_kwargs, nullable_kvs,
+                                   get_type, get_type_hints, is_not_builtin,
+                                   is_type, literal_to_kwargs, nullable_kvs,
                                    optional_type, parse_type)
 from vllm.utils import FlexibleArgumentParser
 
@@ -159,6 +160,18 @@ def test_is_not_builtin(type_hint, expected):
     assert is_not_builtin(type_hint) == expected
 
 
+@pytest.mark.parametrize(
+    ("type_hint", "expected"), [
+        (Annotated[int, "annotation"], {int}),
+        (Optional[int], {int, type(None)}),
+        (Annotated[Optional[int], "annotation"], {int, type(None)}),
+        (Optional[Annotated[int, "annotation"]], {int, type(None)}),
+    ],
+    ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"])
+def test_get_type_hints(type_hint, expected):
+    assert get_type_hints(type_hint) == expected
+
+
 def test_get_kwargs():
     kwargs = get_kwargs(DummyConfig)
     print(kwargs)
diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py
index 049fa2c8b12bd5418a0d97e5c8347f4fd4c0cad0..ac5a1f957dfe4fb3ee62206f082294f5b2d72b12 100644
--- a/tests/engine/test_computed_prefix_blocks.py
+++ b/tests/engine/test_computed_prefix_blocks.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py
index 91c9ba4a74e6206307d705229158eb2238da3783..15c7a97b50e1f52b74d1612b0694b459e2d535d9 100644
--- a/tests/engine/test_executor.py
+++ b/tests/engine/test_executor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import os
diff --git a/tests/engine/test_multi_step_output_processor.py b/tests/engine/test_multi_step_output_processor.py
index b67dd86bfdf0b5fc0044af3f26a00ede074e603d..458f4deb743ac4310f4ac56da3fb1df67c6f220d 100644
--- a/tests/engine/test_multi_step_output_processor.py
+++ b/tests/engine/test_multi_step_output_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from unittest.mock import MagicMock
diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py
index 9b2f45def6c54c9c3a8aff446872ac08c48da644..b5381b61a020a59f9d5921f90f0f17ca83f50aad 100644
--- a/tests/engine/test_multiproc_workers.py
+++ b/tests/engine/test_multiproc_workers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
diff --git a/tests/engine/test_options.py b/tests/engine/test_options.py
index 0cf4f69d56a8703430a39d7fe944e3e5692eafad..fc6a78a5112a140a846b3e23e971845ba2fddf31 100644
--- a/tests/engine/test_options.py
+++ b/tests/engine/test_options.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from contextlib import nullcontext
 
 import pytest
diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py
index b29d6362f571baebd54ef4880757a56aea9b0f57..9c62761d78afbe175f302f77c2d709309440f7ed 100644
--- a/tests/engine/test_short_mm_context.py
+++ b/tests/engine/test_short_mm_context.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py
index 3b596ea3e6a0d7c2bb2e6fff193bb173cb4f3d45..a7c533ec24198c03ad1ca9a865d18c16c99f2174 100644
--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py
index 95657455bd7bbc305633475a6909a0c45fd3de94..a2d35486a5e8179b1fe534d5b1c0fd36fe062869 100644
--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file test accuracy of the vLLM server via LMEval.
 It uses local-completions, which interacts with vLLM
diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py
index 742a66683445736ba416d0bf4ef40f1b2092b1b6..97cf3b5ce8fcb1016c7efcb78b4a4312c77255d1 100644
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import weakref
 
 import pytest
diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py
index 6470249dddbcf7958acb7ea24acaacc8a7c7b050..3a13f8c979f23b038da8ae25dca505b9389a016c 100644
--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py
index d10257761c86160c5ce5ce94a27a127e08be1710..f0fa54aa3131cc6bb866a45f11d678a75e82ef97 100644
--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import weakref
 
diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py
index 9a895c922cc391c62b7a4c2b52aa75c8cc651a56..3c3281c34d5690a7e1ab0700fc2da87373d04854 100644
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import weakref
 
@@ -24,6 +25,12 @@ TOKEN_IDS = [
 ]
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """We can run both engines for this test."""
+    pass
+
+
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
@@ -103,3 +110,19 @@ def test_multiple_sampling_params(llm: LLM):
     # sampling_params is None, default params should be applied
     outputs = llm.generate(PROMPTS, sampling_params=None)
     assert len(PROMPTS) == len(outputs)
+
+
+def test_max_model_len():
+    max_model_len = 20
+    llm = LLM(
+        model=MODEL_NAME,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=0.10,
+        enforce_eager=True,  # reduce test time
+    )
+    sampling_params = SamplingParams(max_tokens=max_model_len + 10)
+    outputs = llm.generate(PROMPTS, sampling_params)
+    for output in outputs:
+        num_total_tokens = len(output.prompt_token_ids) + len(
+            output.outputs[0].token_ids)
+        assert num_total_tokens == max_model_len
diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py
index 099af0f36088b2dbb3c8293b87536440773db36c..b7d53e31fd71b3126b5ae47f782bb5bfd48944e0 100644
--- a/tests/entrypoints/llm/test_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import weakref
 
diff --git a/tests/entrypoints/llm/test_gpu_utilization.py b/tests/entrypoints/llm/test_gpu_utilization.py
index c2b4a935886ba1c638b9502a43c3e8f390664da3..533da9e6d6eac403fdadaf6dc79de3c479f60017 100644
--- a/tests/entrypoints/llm/test_gpu_utilization.py
+++ b/tests/entrypoints/llm/test_gpu_utilization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 
diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py
index dd5d17885eb91e7da52cb1a766b7aca46a73e5a6..d41b0a436c62db0804d9f6ca0b5a8b81dec77553 100644
--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import weakref
diff --git a/tests/entrypoints/llm/test_init.py b/tests/entrypoints/llm/test_init.py
deleted file mode 100644
index 925bf56a93402efb4c9dc70eff0e8144d0bcf36c..0000000000000000000000000000000000000000
--- a/tests/entrypoints/llm/test_init.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-
-from vllm import LLM
-
-from ...utils import error_on_warning
-
-MODEL_NAME = "facebook/opt-125m"
-
-
-def test_pos_args_deprecated():
-    with error_on_warning(DeprecationWarning):
-        LLM(model=MODEL_NAME, tokenizer=MODEL_NAME)
-
-    with error_on_warning(DeprecationWarning):
-        LLM(MODEL_NAME, tokenizer=MODEL_NAME)
-
-    with pytest.warns(DeprecationWarning, match="'tokenizer'"):
-        LLM(MODEL_NAME, MODEL_NAME)
-
-    with pytest.warns(DeprecationWarning,
-                      match="'tokenizer', 'tokenizer_mode'"):
-        LLM(MODEL_NAME, MODEL_NAME, "auto")
diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py
index f065f6564cd2f52050c9457b4b88a519f0a28fae..61b6b4fbf8e35eab20e1e6cb1cad0be3243b2679 100644
--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import sys
 from contextlib import nullcontext
diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py
index 665c6ea1e69948962f5a7a2040f58c1495acfe07..1b7be15d5d6913aff0ec907be71f9a5e07e9f95f 100644
--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py
index 23fd72f4ebbb92bfc38fef24908a65f7fa459076..a606eeab5887eb6db3e28df55cc1ef2b83eed747 100644
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for HF_HUB_OFFLINE mode"""
 import importlib
 import sys
diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py
index d3948e2ed575ed5f61e0c5524a4b5e597a0079e4..41b70f80e3b83d2a54bfc5e0ff5b7093be4e7452 100644
--- a/tests/entrypoints/openai/correctness/test_lmeval.py
+++ b/tests/entrypoints/openai/correctness/test_lmeval.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file test accuracy of the vLLM server via LMEval.
 It uses local-completions, which interacts with vLLM
diff --git a/tests/entrypoints/openai/correctness/test_mteb.py b/tests/entrypoints/openai/correctness/test_mteb.py
index ebf2f829b583b6b989043ea20ca86a4037c9545e..437c485113520dddce8dae0f6efef24f347e8517 100644
--- a/tests/entrypoints/openai/correctness/test_mteb.py
+++ b/tests/entrypoints/openai/correctness/test_mteb.py
@@ -1,9 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import pytest
 
 from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
+                                                      MTEB_EMBED_TOL,
                                                       OpenAIClientMtebEncoder,
                                                       run_mteb_embed_task,
                                                       run_mteb_embed_task_st)
@@ -38,4 +40,4 @@ def test_mteb(server):
     print("SentenceTransformer main score: ", st_main_score)
     print("Difference: ", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, rel=1e-4)
+    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
index 642c204b9ff001f0d813e7edd4581be3283027f8..58195f98bd3510432c06370c39a64793b57819c5 100644
--- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
+++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Evaluate Transcription API correctness by computing Word Error Rate (WER)
 on a given ASR dataset. When provided, it will also compare the WER against
diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py
index 1f7ba0da4f24634b44ddd9ae76bf97935b009b1c..ab3c809054384049d5a32b99020d4dfad0e92964 100644
--- a/tests/entrypoints/openai/test_async_tokenization.py
+++ b/tests/entrypoints/openai/test_async_tokenization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import contextlib
diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index 7f959f31201911957d7280817276e2765138bc07..d67c05ab3e8de4f7634f8f2cdbc523d12e11d1e6 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index a4ac80070773446ea971598e6702de33633421af..a55941976cd826acf41586c9dcfda3bfba6bd0bc 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from http import HTTPStatus
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 2509ef0d280a2a1252d0d97161609fe1654f566e..dab947b21b284f00e4d38fa16e58db376f8f2d28 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # imports for guided decoding tests
 import json
diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py
index 86ee17c6f449166dd969a049e60ff4e9b807e5cf..de63f4ed218b60fb0de3218fbc888ae13c56a3fb 100644
--- a/tests/entrypoints/openai/test_chat_echo.py
+++ b/tests/entrypoints/openai/test_chat_echo.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import NamedTuple
 
diff --git a/tests/entrypoints/openai/test_chat_logit_bias_validation.py b/tests/entrypoints/openai/test_chat_logit_bias_validation.py
index 9dab524ea4801a702d5dfcf6cb52929b0e0f8b4d..e9d1a855294cbdc1f9ec76f5b57e1504b68d729e 100644
--- a/tests/entrypoints/openai/test_chat_logit_bias_validation.py
+++ b/tests/entrypoints/openai/test_chat_logit_bias_validation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai
 import pytest
diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py
index f18fbb0a9c71153de56d04872d06fc77d2515b41..daa4a78c935a7e96a2b1358d09631b602635d719 100644
--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
index e00f001ef730d925c51dc54b698db6beff807d9e..03730b67283c409a72301e8268346a085a6ff68e 100644
--- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai  # use the official client for correctness check
 import pytest
diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py
index 0419395f1816ba47555b2e7c1cf9dbfcf5ba09c3..3c8ed955a65a27d52c020443a4c365ec201f0cd9 100644
--- a/tests/entrypoints/openai/test_chunked_prompt.py
+++ b/tests/entrypoints/openai/test_chunked_prompt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai  # use the official client for correctness check
 import pytest
diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py
index 97124c85e0d33ff6bf3fb98f8f3ecc6328a5553f..6d5f925152c3c9aa9def8796b6443448286437b8 100644
--- a/tests/entrypoints/openai/test_classification.py
+++ b/tests/entrypoints/openai/test_classification.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import requests
diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py
index 8d1abe28a027ad7bfa2282036fe4345ae660d540..504fd72aa4ae226063ab623a5413d44111a4f56b 100644
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 9d12f27a2b879cff2dc628c76d75b2a59dce6726..7e54143f6e1c368d612cab3803e4b40283c4710a 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # imports for guided decoding tests
 import json
 import shutil
diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index dad76b54c5e99ab9894803e546d465a6f324576e..84ad7a09165ad3b0ad376d3dffe25a07e0a5cf48 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai  # use the official client for correctness check
 import pytest
@@ -8,7 +9,7 @@ import pytest_asyncio
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
@@ -21,7 +22,9 @@ def server():  # noqa: F811
         "--guided-decoding-backend",
         "xgrammar",
         "--tool-call-parser",
-        "hermes"
+        "hermes",
+        "--reasoning-parser",
+        "qwen3",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -36,7 +39,12 @@ async def client(server):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
+@pytest.mark.parametrize("stream", [True, False])
+@pytest.mark.parametrize("tool_choice", ["auto", "required"])
+@pytest.mark.parametrize("enable_thinking", [True, False])
+async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
+                                 stream: bool, tool_choice: str,
+                                 enable_thinking: bool):
     tools = [
         {
             "type": "function",
@@ -125,30 +133,38 @@ async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
             "forecast for the next 5 days, in fahrenheit?",
         },
     ]
-
-    # Non-streaming test
-    chat_completion = await client.chat.completions.create(
-        messages=messages,
-        model=model_name,
-        tools=tools,
-        tool_choice="required",
-    )
-
-    assert chat_completion.choices[0].message.tool_calls is not None
-    assert len(chat_completion.choices[0].message.tool_calls) > 0
-
-    # Streaming test
-    stream = await client.chat.completions.create(
-        messages=messages,
-        model=model_name,
-        tools=tools,
-        tool_choice="required",
-        stream=True,
-    )
-
-    output = []
-    async for chunk in stream:
-        if chunk.choices and chunk.choices[0].delta.tool_calls:
-            output.extend(chunk.choices[0].delta.tool_calls)
-
-    assert len(output) > 0
+    if not stream:
+        # Non-streaming test
+        chat_completion = await client.chat.completions.create(
+            messages=messages,
+            model=model_name,
+            tools=tools,
+            tool_choice=tool_choice,
+            extra_body={
+                "chat_template_kwargs": {
+                    "enable_thinking": enable_thinking
+                }
+            })
+
+        assert chat_completion.choices[0].message.tool_calls is not None
+        assert len(chat_completion.choices[0].message.tool_calls) > 0
+    else:
+        # Streaming test
+        output_stream = await client.chat.completions.create(
+            messages=messages,
+            model=model_name,
+            tools=tools,
+            tool_choice=tool_choice,
+            stream=True,
+            extra_body={
+                "chat_template_kwargs": {
+                    "enable_thinking": enable_thinking
+                }
+            })
+
+        output = []
+        async for chunk in output_stream:
+            if chunk.choices and chunk.choices[0].delta.tool_calls:
+                output.extend(chunk.choices[0].delta.tool_calls)
+
+        assert len(output) > 0
diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
index b7ee3e33c2d250fd3ac3f5f2cfb6f1f9cf19a096..00d3ffb61ee9f995367aa15fa58c51192a9b6632 100644
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 import io
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index 1019bfd589362eb6205eb92e0e34cef4452b9715..80640a2e1a8bce0717e6f795045075cfb88e3916 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 
@@ -11,7 +12,8 @@ import requests
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...models.utils import run_embedding_correctness_test
+from ...models.language.pooling.embed_utils import (
+    run_embedding_correctness_test)
 from ...utils import RemoteOpenAIServer
 
 MODEL_NAME = "intfloat/multilingual-e5-small"
diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py
index 332fa332a4a41f90435c11c95384783f7ba712ad..08b797dc57ad270742b03b99ff379cc0fcc91104 100644
--- a/tests/entrypoints/openai/test_embedding_dimensions.py
+++ b/tests/entrypoints/openai/test_embedding_dimensions.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
 """
@@ -11,7 +12,9 @@ import pytest
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 
 from ...conftest import HfRunner
-from ...models.utils import EmbedModelInfo, run_embedding_correctness_test
+from ...models.language.pooling.embed_utils import (
+    run_embedding_correctness_test)
+from ...models.utils import EmbedModelInfo
 from ...utils import RemoteOpenAIServer
 
 MODELS = [
diff --git a/tests/entrypoints/openai/test_encoder_decoder.py b/tests/entrypoints/openai/test_encoder_decoder.py
index 52b4df9ceecd734ab06fb28416a10867f70dd0fe..9c2aef23e87722aa97dc57315d8bf5e14305aeae 100644
--- a/tests/entrypoints/openai/test_encoder_decoder.py
+++ b/tests/entrypoints/openai/test_encoder_decoder.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai
 import pytest
diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
index 2fc08b47513e687864b6ff33fe68520ac49281a1..bcdeaaacedea0e03ec77187c0eb921c249e4065a 100644
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import json
@@ -313,3 +314,37 @@ async def test_loading_invalid_adapters_does_not_break_others(
         prompt=["Hello there", "Foo bar bazz buzz"],
         max_tokens=5,
     )
+
+
+@pytest.mark.asyncio
+async def test_beam_search_with_lora_adapters(
+    client: openai.AsyncOpenAI,
+    tmp_path,
+    zephyr_lora_files,
+):
+    """Validate that async beam search can be used with lora."""
+
+    async def load_and_run_adapter(adapter_name: str):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": adapter_name,
+                              "lora_path": str(zephyr_lora_files)
+                          })
+        for _ in range(3):
+            await client.completions.create(
+                model=adapter_name,
+                prompt=["Hello there", "Foo bar bazz buzz"],
+                max_tokens=5,
+                extra_body=dict(use_beam_search=True),
+            )
+
+    lora_tasks = []
+    for i in range(3):
+        lora_tasks.append(
+            asyncio.create_task(load_and_run_adapter(f"adapter_{i}")))
+
+    results, _ = await asyncio.wait(lora_tasks)
+
+    for r in results:
+        assert not isinstance(r, Exception), f"Got exception {r}"
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index c96151349eb3f7d3f99f9b0edce4b64f33fef6ee..d4afdf7751c8f57ce392287a95f25052914311ec 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import suppress
 from dataclasses import dataclass, field
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 42f7b098f917de39f7713c05a30523489c9264a4..2d7b845736b8737516c89b885bbc9e5edefc6c85 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import subprocess
 import sys
@@ -171,10 +172,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 
 EXPECTED_METRICS = [
     "vllm:num_requests_running",
-    "vllm:num_requests_swapped",  # deprecated
     "vllm:num_requests_waiting",
     "vllm:gpu_cache_usage_perc",
-    "vllm:cpu_cache_usage_perc",  # deprecated
     "vllm:time_to_first_token_seconds_sum",
     "vllm:time_to_first_token_seconds_bucket",
     "vllm:time_to_first_token_seconds_count",
@@ -274,10 +273,7 @@ EXPECTED_METRICS_V1 = [
     "vllm:request_decode_time_seconds_count",
 ]
 
-HIDDEN_DEPRECATED_METRICS = [
-    "vllm:num_requests_swapped",
-    "vllm:cpu_cache_usage_perc",
-]
+HIDDEN_DEPRECATED_METRICS: list[str] = []
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py
index 3d4f1cde27895d20f46b7313e4066ec3f6a40e3a..1980daa80db9e5b97b578751bc972ef8160671c5 100644
--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai  # use the official client for correctness check
 import pytest
diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py
index a1b7a205a4575b1347857a156515f1f86abba69e..f0ce50debe494224eb7562483e2bfc0f7804b9ec 100644
--- a/tests/entrypoints/openai/test_oot_registration.py
+++ b/tests/entrypoints/openai/test_oot_registration.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from ...utils import VLLM_PATH, RemoteOpenAIServer
 
diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py
index cae2a3b59553da17e3d93be0629eafbe3f194472..4ded37595384ebb0921caa90c65a488ae1180a0f 100644
--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Final
 
 import pytest
diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/openai/test_pooling.py
index 72ab12c564602a778244a916bed86989ae0e082d..cf16ace6537acd70822035a892012deac1c0aad1 100644
--- a/tests/entrypoints/openai/test_pooling.py
+++ b/tests/entrypoints/openai/test_pooling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 
diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py
index e384915899d3d70c5b5eed2e9e5258903c91eed0..ff0730c77032cf8b437dbe419375af0298195472 100644
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # imports for guided decoding tests
 import openai
diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py
index ba11cd3a29a8ebeb35068a89bc21eb14bee02129..19eba320c279583b1129b86cd8ac3d1e55127ae1 100644
--- a/tests/entrypoints/openai/test_rerank.py
+++ b/tests/entrypoints/openai/test_rerank.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import requests
diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py
index 6474858642d78ad316c5500153106d3c3b80506d..099062e55c7298a271f216edfae7dec1f72be2fa 100644
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Separate these tests out from test_completion and test_chat, because they
 # require launching a second server with a different flag. Running both servers
diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py
index 106d6b2c14f834f156b2cb486ba5b2f324ca03cb..7b4966848b9dea935a540e824e28933f40eacb0d 100644
--- a/tests/entrypoints/openai/test_root_path.py
+++ b/tests/entrypoints/openai/test_root_path.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
 import os
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 643d0d06abcb8bcccd9c984b145b07cd2febda20..e23f41e983b0d2b0b2e5a032fb6bf514fe21ee50 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -1,10 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import subprocess
-import sys
 import tempfile
 
+import pytest
+
 from vllm.entrypoints.openai.protocol import BatchRequestOutput
 
 # ruff: noqa: E501
@@ -24,9 +26,13 @@ INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "
 {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}
 {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
 
-INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
 
+INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
+
 
 def test_empty_file():
     with tempfile.NamedTemporaryFile(
@@ -35,9 +41,8 @@ def test_empty_file():
         input_file.write("")
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "intfloat/multilingual-e5-small"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "intfloat/multilingual-e5-small"
         ], )
         proc.communicate()
         proc.wait()
@@ -54,9 +59,8 @@ def test_completions():
         input_file.write(INPUT_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "NousResearch/Meta-Llama-3-8B-Instruct"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "NousResearch/Meta-Llama-3-8B-Instruct"
         ], )
         proc.communicate()
         proc.wait()
@@ -79,9 +83,8 @@ def test_completions_invalid_input():
         input_file.write(INVALID_INPUT_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "NousResearch/Meta-Llama-3-8B-Instruct"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "NousResearch/Meta-Llama-3-8B-Instruct"
         ], )
         proc.communicate()
         proc.wait()
@@ -95,9 +98,8 @@ def test_embeddings():
         input_file.write(INPUT_EMBEDDING_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "intfloat/multilingual-e5-small"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "intfloat/multilingual-e5-small"
         ], )
         proc.communicate()
         proc.wait()
@@ -110,16 +112,17 @@ def test_embeddings():
             BatchRequestOutput.model_validate_json(line)
 
 
-def test_score():
+@pytest.mark.parametrize("input_batch",
+                         [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH])
+def test_score(input_batch):
     with tempfile.NamedTemporaryFile(
             "w") as input_file, tempfile.NamedTemporaryFile(
                 "r") as output_file:
-        input_file.write(INPUT_SCORE_BATCH)
+        input_file.write(input_batch)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable,
-            "-m",
-            "vllm.entrypoints.openai.run_batch",
+            "vllm",
+            "run-batch",
             "-i",
             input_file.name,
             "-o",
diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py
index b373f29127524a253d6ca84f2e3259e0fd106933..af51a0a3eeebfe2b452b631103fa0f2ebe1eeaab 100644
--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
 import pytest
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 5e11af8cf89294ae49f3138a65873bf1bb59b7ff..94740fefc870ee049a0c3bbc990dc7c9bfd16825 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from contextlib import suppress
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index e8f3c2f8b39ee67105400e3546572900947d30a2..28af6489a4d0a7f41492ff638d2a4fb7e2270d9d 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from http import HTTPStatus
 from unittest.mock import MagicMock
diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py
index 0f12ac9b260be5057f92c9651ec108351339b024..29a94c852bba6fa40af2263f9069f3ce33a19c00 100644
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai
 import pytest
diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/openai/test_sleep.py
index 3ca8a9a410ffd9c51f8bd6e04ae0d9918f9fa30b..0dd6af17ef22752dbb7a7145a1ee496e56311f8f 100644
--- a/tests/entrypoints/openai/test_sleep.py
+++ b/tests/entrypoints/openai/test_sleep.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import requests
 
diff --git a/tests/entrypoints/openai/test_tensorizer_entrypoint.py b/tests/entrypoints/openai/test_tensorizer_entrypoint.py
index f1ab7223048db1e9b42451596abb365961a45fa0..e143150356d92bfa58531f109db551fe060e5599 100644
--- a/tests/entrypoints/openai/test_tensorizer_entrypoint.py
+++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
 import json
 import tempfile
diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py
index 9773f3e45b99c452d3432b9c44e9a44221cada4c..57dd25fe1b16452dc7f5d0fbf49bb57a599473e6 100644
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import pytest_asyncio
@@ -76,11 +77,11 @@ async def test_tokenize_completions(
                                  })
         response.raise_for_status()
 
-        assert response.json() == {
-            "tokens": tokens,
-            "count": len(tokens),
-            "max_model_len": 8192
-        }
+        result = response.json()
+        assert result["tokens"] == tokens
+        assert result["count"] == len(tokens)
+        assert result["max_model_len"] == 8192
+        assert result["token_strs"] is None
 
 
 @pytest.mark.asyncio
@@ -138,11 +139,11 @@ async def test_tokenize_chat(
                                          })
                 response.raise_for_status()
 
-                assert response.json() == {
-                    "tokens": tokens,
-                    "count": len(tokens),
-                    "max_model_len": 8192
-                }
+                result = response.json()
+                assert result["tokens"] == tokens
+                assert result["count"] == len(tokens)
+                assert result["max_model_len"] == 8192
+                assert result["token_strs"] is None
 
 
 @pytest.mark.asyncio
@@ -215,11 +216,46 @@ async def test_tokenize_chat_with_tools(
                 )
                 response.raise_for_status()
 
-                assert response.json() == {
-                    "tokens": tokens,
-                    "count": len(tokens),
-                    "max_model_len": 8192,
-                }
+                result = response.json()
+                assert result["tokens"] == tokens
+                assert result["count"] == len(tokens)
+                assert result["max_model_len"] == 8192
+                assert result["token_strs"] is None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name, tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
+)
+async def test_tokenize_with_return_token_strs(
+    server: RemoteOpenAIServer,
+    model_name: str,
+    tokenizer_name: str,
+):
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")
+
+    prompt = "This is a token_strs test prompt! vllm1"
+    response = requests.post(
+        server.url_for("tokenize"),
+        json={
+            "prompt": prompt,
+            "model": model_name,
+            "return_token_strs": True
+        },
+    )
+    response.raise_for_status()
+
+    tokens = tokenizer.encode(prompt, add_special_tokens=True)
+    tokens_str = tokenizer.convert_ids_to_tokens(tokens)
+
+    result = response.json()
+    assert result["tokens"] == tokens
+    assert result["count"] == len(tokens)
+    assert result["max_model_len"] == 8192
+    assert result["token_strs"] == tokens_str
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py
index 5c48df3cebbc254dfb85b4de620a20d2b31d6176..1cb0a39df5139b22a456860e490fbcbbe62ee798 100644
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # imports for guided decoding tests
 import io
diff --git a/tests/entrypoints/openai/test_truncation.py b/tests/entrypoints/openai/test_truncation.py
index 137ed9db85891ba93613078ac3e08b6ab308e20e..b33a26af65b33fe9cafa5f6872910667c5fcb670 100644
--- a/tests/entrypoints/openai/test_truncation.py
+++ b/tests/entrypoints/openai/test_truncation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
 import openai
diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py
index 53f057a294c0a4b84690d7c418e51df0836e584f..990ea3579291d811f7d5c1aac1d1a4a23961f164 100644
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index 1ab50b41c7ecbf32cdd8bd99f51770afe6e813fa..4513d8b3420f4669886b3c0f799befca11f2f10f 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
index 26c68e06c199f12be02cfb682dd5616be0668770..fe982e286ae47d865f84710381d3e7abb5ea92ec 100644
--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
index 92ba1376e200236b49960b36faa8acd5f948b90a..8c86b4889e15b20aa38c3937060229c77567ce0c 100644
--- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
 
 import pytest
 
@@ -191,3 +192,27 @@ def test_streaming_tool_call_with_large_steps():
     assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
     assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
     assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
+
+
+@pytest.mark.parametrize("streaming", [False])
+def test_regex_timeout_handling(streaming: bool):
+    """test regex timeout is handled gracefully"""
+    mock_tokenizer = MagicMock()
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
+        "llama4_pythonic")(mock_tokenizer)
+
+    fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
+
+    # create a mock regex that raises TimeoutError
+    mock_regex = MagicMock()
+    mock_regex.match.side_effect = TimeoutError("Regex timeout")
+
+    with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex):
+        content, tool_calls = run_tool_extraction(tool_parser,
+                                                  fake_problematic_input,
+                                                  streaming=streaming)
+
+        # should treat as regular text when regex times out
+        assert content == fake_problematic_input
+        assert len(tool_calls) == 0
+        mock_regex.match.assert_called_once()
diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
index fbbbc1fb2a596864352b6e691346add2a751b224..d83137472598e1bf34350b105b8d8e95e18506fa 100644
--- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
 
 import pytest
 
@@ -159,3 +160,27 @@ def test_streaming_tool_call_with_large_steps():
     assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
     assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
     assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
+
+
+@pytest.mark.parametrize("streaming", [False])
+def test_regex_timeout_handling(streaming: bool):
+    """test regex timeout is handled gracefully"""
+    mock_tokenizer = MagicMock()
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
+        "llama4_pythonic")(mock_tokenizer)
+
+    fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
+
+    # create a mock regex that raises TimeoutError
+    mock_regex = MagicMock()
+    mock_regex.match.side_effect = TimeoutError("Regex timeout")
+
+    with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex):
+        content, tool_calls = run_tool_extraction(tool_parser,
+                                                  fake_problematic_input,
+                                                  streaming=streaming)
+
+        # should treat as regular text when regex times out
+        assert content == fake_problematic_input
+        assert len(tool_calls) == 0
+        mock_regex.match.assert_called_once()
diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py
index ab8f4bd678fdfd6d0cb4a902fa433806811b3751..e1b41f45f55480962e7f15eca534c5c541a4089b 100644
--- a/tests/entrypoints/openai/tool_parsers/utils.py
+++ b/tests/entrypoints/openai/tool_parsers/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Union
diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4af60a782651e73c865ae4205fb923b1cbd3963
--- /dev/null
+++ b/tests/entrypoints/test_api_server_process_manager.py
@@ -0,0 +1,269 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import multiprocessing
+import socket
+import threading
+import time
+from typing import Optional
+from unittest.mock import patch
+
+import pytest
+
+from vllm.v1.utils import (APIServerProcessManager,
+                           wait_for_completion_or_failure)
+
+# Global variables to control worker behavior
+WORKER_RUNTIME_SECONDS = 0.5
+
+
+# Mock implementation of run_api_server_worker
+def mock_run_api_server_worker(listen_address, sock, args, client_config=None):
+    """Mock run_api_server_worker that runs for a specific time."""
+    print(f"Mock worker started with client_config: {client_config}")
+    time.sleep(WORKER_RUNTIME_SECONDS)
+    print("Mock worker completed successfully")
+
+
+@pytest.fixture
+def api_server_args():
+    """Fixture to provide arguments for APIServerProcessManager."""
+    sock = socket.socket()
+    return {
+        "target_server_fn":
+        mock_run_api_server_worker,
+        "listen_address":
+        "localhost:8000",
+        "sock":
+        sock,
+        "args":
+        "test_args",  # Simple string to avoid pickling issues
+        "num_servers":
+        3,
+        "input_addresses": [
+            "tcp://127.0.0.1:5001", "tcp://127.0.0.1:5002",
+            "tcp://127.0.0.1:5003"
+        ],
+        "output_addresses": [
+            "tcp://127.0.0.1:6001", "tcp://127.0.0.1:6002",
+            "tcp://127.0.0.1:6003"
+        ],
+        "stats_update_address":
+        "tcp://127.0.0.1:7000",
+    }
+
+
+@pytest.mark.parametrize("with_stats_update", [True, False])
+def test_api_server_process_manager_init(api_server_args, with_stats_update):
+    """Test initializing the APIServerProcessManager."""
+    # Set the worker runtime to ensure tests complete in reasonable time
+    global WORKER_RUNTIME_SECONDS
+    WORKER_RUNTIME_SECONDS = 0.5
+
+    # Copy the args to avoid mutating the
+    args = api_server_args.copy()
+
+    if not with_stats_update:
+        args.pop("stats_update_address")
+    manager = APIServerProcessManager(**args)
+
+    try:
+        # Verify the manager was initialized correctly
+        assert len(manager.processes) == 3
+
+        # Verify all processes are running
+        for proc in manager.processes:
+            assert proc.is_alive()
+
+        print("Waiting for processes to run...")
+        time.sleep(WORKER_RUNTIME_SECONDS / 2)
+
+        # They should still be alive at this point
+        for proc in manager.processes:
+            assert proc.is_alive()
+
+    finally:
+        # Always clean up the processes
+        print("Cleaning up processes...")
+        manager.close()
+
+        # Give processes time to terminate
+        time.sleep(0.2)
+
+        # Verify all processes were terminated
+        for proc in manager.processes:
+            assert not proc.is_alive()
+
+
+@patch("vllm.entrypoints.cli.serve.run_api_server_worker",
+       mock_run_api_server_worker)
+def test_wait_for_completion_or_failure(api_server_args):
+    """Test that wait_for_completion_or_failure works with failures."""
+    global WORKER_RUNTIME_SECONDS
+    WORKER_RUNTIME_SECONDS = 1.0
+
+    # Create the manager
+    manager = APIServerProcessManager(**api_server_args)
+
+    try:
+        assert len(manager.processes) == 3
+
+        # Create a result capture for the thread
+        result: dict[str, Optional[Exception]] = {"exception": None}
+
+        def run_with_exception_capture():
+            try:
+                wait_for_completion_or_failure(api_server_manager=manager)
+            except Exception as e:
+                result["exception"] = e
+
+        # Start a thread to run wait_for_completion_or_failure
+        wait_thread = threading.Thread(target=run_with_exception_capture,
+                                       daemon=True)
+        wait_thread.start()
+
+        # Let all processes run for a short time
+        time.sleep(0.2)
+
+        # All processes should still be running
+        assert all(proc.is_alive() for proc in manager.processes)
+
+        # Now simulate a process failure
+        print("Simulating process failure...")
+        manager.processes[0].terminate()
+
+        # Wait for the wait_for_completion_or_failure
+        # to detect and handle the failure
+        # This should trigger it to terminate all other processes
+        wait_thread.join(timeout=1.0)
+
+        # The wait thread should have exited
+        assert not wait_thread.is_alive()
+
+        # Verify that an exception was raised with appropriate error message
+        assert result["exception"] is not None
+        assert "died with exit code" in str(result["exception"])
+
+        # All processes should now be terminated
+        for i, proc in enumerate(manager.processes):
+            assert not proc.is_alive(), f"Process {i} should not be alive"
+
+    finally:
+        manager.close()
+        time.sleep(0.2)
+
+
+@pytest.mark.timeout(30)
+def test_normal_completion(api_server_args):
+    """Test that wait_for_completion_or_failure works in normal completion."""
+    global WORKER_RUNTIME_SECONDS
+    WORKER_RUNTIME_SECONDS = 0.1
+
+    # Create the manager
+    manager = APIServerProcessManager(**api_server_args)
+
+    try:
+        # Give processes time to terminate
+        # wait for processes to complete
+        remaining_processes = manager.processes.copy()
+        while remaining_processes:
+            for proc in remaining_processes:
+                if not proc.is_alive():
+                    remaining_processes.remove(proc)
+            time.sleep(0.1)
+
+        # Verify all processes have terminated
+        for i, proc in enumerate(manager.processes):
+            assert not proc.is_alive(
+            ), f"Process {i} still alive after terminate()"
+
+        # Now call wait_for_completion_or_failure
+        # since all processes have already
+        # terminated, it should return immediately
+        # with no error
+        wait_for_completion_or_failure(api_server_manager=manager)
+
+    finally:
+        # Clean up just in case
+        manager.close()
+        time.sleep(0.2)
+
+
+@pytest.mark.timeout(30)
+def test_external_process_monitoring(api_server_args):
+    """Test that wait_for_completion_or_failure handles additional processes."""
+    global WORKER_RUNTIME_SECONDS
+    WORKER_RUNTIME_SECONDS = 100
+
+    # Create and start the external process
+    # (simulates local_engine_manager or coordinator)
+    spawn_context = multiprocessing.get_context("spawn")
+    external_proc = spawn_context.Process(target=mock_run_api_server_worker,
+                                          name="MockExternalProcess")
+    external_proc.start()
+
+    # Create the class to simulate a coordinator
+    class MockCoordinator:
+
+        def __init__(self, proc):
+            self.proc = proc
+
+        def close(self):
+            if self.proc.is_alive():
+                self.proc.terminate()
+                self.proc.join(timeout=0.5)
+
+    # Create a mock coordinator with the external process
+    mock_coordinator = MockCoordinator(external_proc)
+
+    # Create the API server manager
+    manager = APIServerProcessManager(**api_server_args)
+
+    try:
+        # Verify manager initialization
+        assert len(manager.processes) == 3
+
+        # Create a result capture for the thread
+        result: dict[str, Optional[Exception]] = {"exception": None}
+
+        def run_with_exception_capture():
+            try:
+                wait_for_completion_or_failure(api_server_manager=manager,
+                                               coordinator=mock_coordinator)
+            except Exception as e:
+                result["exception"] = e
+
+        # Start a thread to run wait_for_completion_or_failure
+        wait_thread = threading.Thread(target=run_with_exception_capture,
+                                       daemon=True)
+        wait_thread.start()
+
+        # Terminate the external process to trigger a failure
+        time.sleep(0.2)
+        external_proc.terminate()
+
+        # Wait for the thread to detect the failure
+        wait_thread.join(timeout=1.0)
+
+        # The wait thread should have completed
+        assert not wait_thread.is_alive(
+        ), "wait_for_completion_or_failure thread still running"
+
+        # Verify that an exception was raised with appropriate error message
+        assert result["exception"] is not None, "No exception was raised"
+        error_message = str(result["exception"])
+        assert "died with exit code" in error_message, \
+            f"Unexpected error message: {error_message}"
+        assert "MockExternalProcess" in error_message, \
+            f"Error doesn't mention external process: {error_message}"
+
+        # Verify that all API server processes were terminated as a result
+        for i, proc in enumerate(manager.processes):
+            assert not proc.is_alive(
+            ), f"API server process {i} was not terminated"
+
+    finally:
+        # Clean up
+        manager.close()
+        mock_coordinator.close()
+        time.sleep(0.2)
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 9f1f2321d9e648dabdb291997ae97d850fab1be3..49294664275a0e528d38f11bf265f18f1d97b2bb 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
 from typing import Optional
diff --git a/tests/entrypoints/test_ssl_cert_refresher.py b/tests/entrypoints/test_ssl_cert_refresher.py
index 23ce7a679f3eabdb60965e6b6cd6480ba342e012..33ad2cfd3a33aaa33cee952dd280bd868ee6860c 100644
--- a/tests/entrypoints/test_ssl_cert_refresher.py
+++ b/tests/entrypoints/test_ssl_cert_refresher.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import tempfile
 from pathlib import Path
diff --git a/tests/fastsafetensors_loader/test_fastsafetensors_loader.py b/tests/fastsafetensors_loader/test_fastsafetensors_loader.py
index 184bee2a7153a6cda207ed7ebbbafa3e97be4560..1b95bf59f67c6e49696007728b55519364fd7e94 100644
--- a/tests/fastsafetensors_loader/test_fastsafetensors_loader.py
+++ b/tests/fastsafetensors_loader/test_fastsafetensors_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import SamplingParams
 from vllm.config import LoadFormat
diff --git a/tests/fastsafetensors_loader/test_weight_utils.py b/tests/fastsafetensors_loader/test_weight_utils.py
index 8772035af502fafe929adeece52d2108a4f84629..78d23acfec7c564e94e88c0630bf3de2e64f422b 100644
--- a/tests/fastsafetensors_loader/test_weight_utils.py
+++ b/tests/fastsafetensors_loader/test_weight_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import glob
 import tempfile
diff --git a/tests/kernels/allclose_default.py b/tests/kernels/allclose_default.py
index 97ceffab4eb88af55eb5ead21c4efca28a6441f0..9d65159bf64fe11f212fb25da6cf671d7ded72e9 100644
--- a/tests/kernels/allclose_default.py
+++ b/tests/kernels/allclose_default.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/kernels/attention/conftest.py b/tests/kernels/attention/conftest.py
index 4f04ec94753297d80e1aaddf0db5054fce8ad139..88a2fb62b2540f7ff06c7d32a9657e416912ef42 100644
--- a/tests/kernels/attention/conftest.py
+++ b/tests/kernels/attention/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py
index d9f956fbc7c00cf8a4a23da38aa259e943f51067..2d381a99be60c4523a435adefcceefbdaa6e5c3a 100644
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from typing import Optional
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index 58da01f0ebbf3f08bacc2a0f39299c114d645d72..f3e64155703c20947bdba6812d7b5326af0f6a8f 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import patch
 
@@ -84,7 +85,10 @@ def test_env(
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float16, torch.float16,
                                            block_size, False)
-            assert backend.get_name() == "TORCH_SDPA"
+            if use_v1:
+                assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
+            else:
+                assert backend.get_name() == "TORCH_SDPA"
 
         elif device == "hip":
             with patch("vllm.attention.selector.current_platform",
diff --git a/tests/kernels/attention/test_blocksparse_attention.py b/tests/kernels/attention/test_blocksparse_attention.py
index 82d038257575c5417d9f9762be28017726540481..9aee818c99569714a48f41824ada9abfa01e9dc8 100644
--- a/tests/kernels/attention/test_blocksparse_attention.py
+++ b/tests/kernels/attention/test_blocksparse_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from typing import Optional
diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py
index 2f2212dd2b0e05b810077e2a859e6d4e9359ed85..e508505c2b05dbeee8c7ebd8e5304d203586c86f 100644
--- a/tests/kernels/attention/test_cache.py
+++ b/tests/kernels/attention/test_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 
diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py
index d6570e6334b163bf05159650183baba851901db6..1e7e7e0a7f84b990b405c3a7b90d539bd7e58357 100755
--- a/tests/kernels/attention/test_cascade_flash_attn.py
+++ b/tests/kernels/attention/test_cascade_flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/kernels/attention/test_encoder_decoder_attn.py b/tests/kernels/attention/test_encoder_decoder_attn.py
index c8ee46bc65d4d7fde9bece94bcaaf4f832856644..c6ce7b0cce40dbedf36942da2e548e390abb9cbf 100644
--- a/tests/kernels/attention/test_encoder_decoder_attn.py
+++ b/tests/kernels/attention/test_encoder_decoder_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Tests:
 
diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py
index 88516b75cde2b81c77ace3254bae76923fa5b782..bd3190d09b0fa4003ba7a2bb0bc8993a0e000bf0 100644
--- a/tests/kernels/attention/test_flash_attn.py
+++ b/tests/kernels/attention/test_flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py
index 5ad1137aa6af7e42fdc7d48aaf05d16db4cce656..3ad6e1d32911b448a1b1e5a3deb7a739cee120fb 100644
--- a/tests/kernels/attention/test_flashinfer.py
+++ b/tests/kernels/attention/test_flashinfer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py
index 0d51a8e7fee19a0271f68448dd83db5913304366..21b08e45fd6fdb750719b55cb2634517682b967f 100644
--- a/tests/kernels/attention/test_flashmla.py
+++ b/tests/kernels/attention/test_flashmla.py
@@ -1,5 +1,6 @@
 # Adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/tests/test_flash_mla.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 import random
 
diff --git a/tests/kernels/attention/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py
index fbad52987dd2bbb632da7a411f1305919bd73c97..de45ee1ed5ccac3fd084f698d111bb2e549fc841 100644
--- a/tests/kernels/attention/test_lightning_attn.py
+++ b/tests/kernels/attention/test_lightning_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py
index 7038fbea5c22eb20be3205746f8e751eea760baa..9d1a301ebe304f4880129449d57ae98220daf513 100644
--- a/tests/kernels/attention/test_merge_attn_states.py
+++ b/tests/kernels/attention/test_merge_attn_states.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import pytest
diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py
index 5a18b7916f0f69caee074032db2f580544134537..53c37554b15a344e26bdf230d5924075251cf4f1 100644
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Test:
 
diff --git a/tests/kernels/attention/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py
index 8cebe32c4c5bb465756bcbc64d1f17c72d5d0de8..5a7480a6beaeadf949669bb3e4aaac6c3b5ad393 100644
--- a/tests/kernels/attention/test_mla_decode_cpu.py
+++ b/tests/kernels/attention/test_mla_decode_cpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 import torch.nn.functional as F
diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py
index 9333777d38ea036de22f6965bfe9ad2230e454be..b09e1bbc427946612d66d325e6a7e1cc6c67a7b3 100644
--- a/tests/kernels/attention/test_prefix_prefill.py
+++ b/tests/kernels/attention/test_prefix_prefill.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 import random
diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py
index 6ffe27abf709e992de426eb7142e793bc086c740..ed58880cc9e6cad51c5585b41d18b620e802172c 100644
--- a/tests/kernels/attention/test_rocm_attention_selector.py
+++ b/tests/kernels/attention/test_rocm_attention_selector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/attention/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py
index fd3c9fa4196a7260a97cfb4bcb62a073d529dc71..358b374ea75bc4e6afdebad03b398d3af8e59a5c 100644
--- a/tests/kernels/attention/test_triton_decode_attention.py
+++ b/tests/kernels/attention/test_triton_decode_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py
index 4e15d00255a4ff3f7b8b8fefc99c05cca46e7439..0cb7f5963c79b74b85c333abb483c5a32a6690d6 100644
--- a/tests/kernels/attention/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
@@ -13,7 +14,9 @@ HEAD_SIZES = [128, 256]
 BLOCK_SIZES = [16, 32]
 
 DTYPES = [torch.float16, torch.bfloat16]
-QDTYPES = [None, torch.float8_e4m3fn]
+QDTYPES = [None, torch.float8_e4m3fn] if not current_platform.is_rocm() else [
+    None, torch.float8_e4m3fnuz
+]
 # one value large enough to test overflow in index calculation.
 # one value small enough to test the schema op check
 NUM_BLOCKS = [32768, 2048]
diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py
index 79f838a954e70f8fe26488883358808ac400ad0d..29c5e70a8ba85219877b785fb2d34b8a4f6bd84b 100644
--- a/tests/kernels/core/test_activation.py
+++ b/tests/kernels/core/test_activation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 
diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py
index 7a591f53678343be1d715c58f7d17f1ed5eef599..19703b8a2f978fa7757623b0f29fc584cd8b48f6 100644
--- a/tests/kernels/core/test_fused_quant_layernorm.py
+++ b/tests/kernels/core/test_fused_quant_layernorm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py
index fa4bbe458645f83d59e9d0a7b324138d76aec970..3eac062738f807cc20003c383f51bb723dc56478 100644
--- a/tests/kernels/core/test_layernorm.py
+++ b/tests/kernels/core/test_layernorm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/core/test_opcheck.py b/tests/kernels/core/test_opcheck.py
index c9a9679c5d80f4dfa67a6589d481063706711bfb..40ced08b933a7171b40d348e5fa3bd8283279e4e 100644
--- a/tests/kernels/core/test_opcheck.py
+++ b/tests/kernels/core/test_opcheck.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Tests for miscellaneous utilities
 """
diff --git a/tests/kernels/core/test_permute_cols.py b/tests/kernels/core/test_permute_cols.py
index 35d62079fb65ded26b8f405a8f7056ec8565417b..e18f6230dbceabb3f507e683b6bc25be47bb12d9 100644
--- a/tests/kernels/core/test_permute_cols.py
+++ b/tests/kernels/core/test_permute_cols.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py
index f327deb0e549ec74d6d8f0eda45b57e18b152529..ab6f1ccf881fddb921a07d0f73933510f319c017 100644
--- a/tests/kernels/core/test_pos_encoding.py
+++ b/tests/kernels/core/test_pos_encoding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import accumulate, product
 from typing import Callable, Optional
@@ -70,7 +71,7 @@ def test_rotary_embedding(
     device: str,
     use_key: bool,
     max_position: int = 8192,
-    base: int = 10000,
+    base: float = 10000,
 ) -> None:
     if rotary_dim is None:
         rotary_dim = head_size
@@ -135,7 +136,7 @@ def test_batched_rotary_embedding(
     device: str,
     use_key: bool,
     max_position: int = 8192,
-    base: int = 10000,
+    base: float = 10000,
 ) -> None:
     current_platform.seed_everything(seed)
     torch.set_default_device(device)
@@ -203,7 +204,7 @@ def test_batched_rotary_embedding_multi_lora(
     device: str,
     use_key: bool,
     max_position: int = 8192,
-    base: int = 10000,
+    base: float = 10000,
 ) -> None:
     current_platform.seed_everything(seed)
     torch.set_default_device(device)
diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py
index 8383f943b9fa4705212982ed3c16b5d3916ac470..db0fdcbf5ef227e72031fce88b0c09838eaac641 100644
--- a/tests/kernels/core/test_rotary_embedding.py
+++ b/tests/kernels/core/test_rotary_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Tests for miscellaneous utilities
 """
diff --git a/tests/kernels/core/test_uva.py b/tests/kernels/core/test_uva.py
index f641ae7b67c2d4423c8e5c8daee65d732b64a3a7..c71215e4c646ba64a62fff37cbf7ab0169df4f08 100644
--- a/tests/kernels/core/test_uva.py
+++ b/tests/kernels/core/test_uva.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/kernels/mamba/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py
index 93064e23dd7d1dabfb1cb2100329198aa2219cff..addb8bfcda137b00fcc15a0cbc83f32bdfce19ed 100644
--- a/tests/kernels/mamba/test_causal_conv1d.py
+++ b/tests/kernels/mamba/test_causal_conv1d.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py
index abcf3888fea26100c7686f2ad193291d4a841522..f5c6a18614ff72526dea7e453dc373bbddbd1697 100644
--- a/tests/kernels/mamba/test_mamba_mixer2.py
+++ b/tests/kernels/mamba/test_mamba_mixer2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import unittest
 
diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py
index 84d4c347e0d81683b6ee882ea2924635015b6935..8dece26ddb29c2519fb92513fb3d233b6c7387bb 100644
--- a/tests/kernels/mamba/test_mamba_ssm.py
+++ b/tests/kernels/mamba/test_mamba_ssm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py
index f5e751bea4149fdc7d4685808ace79b16ad21b42..abed1252a3ce613bb352a893aaa817e3da4c377a 100644
--- a/tests/kernels/mamba/test_mamba_ssm_ssd.py
+++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/moe/__init__.py b/tests/kernels/moe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/kernels/moe/deepep_utils.py b/tests/kernels/moe/deepep_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..117f1babdf62a092160947b9da949e5fbc1d22f4
--- /dev/null
+++ b/tests/kernels/moe/deepep_utils.py
@@ -0,0 +1,191 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+DeepEP test utilities
+"""
+import dataclasses
+import importlib
+import traceback
+from typing import Callable, Optional
+
+import torch
+from torch.distributed import ProcessGroup
+from torch.multiprocessing import (
+    spawn)  # pyright: ignore[reportPrivateImportUsage]
+from typing_extensions import Concatenate, ParamSpec
+
+has_deep_ep = importlib.util.find_spec("deep_ep") is not None
+if has_deep_ep:
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
+        DeepEPHTPrepareAndFinalize)
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
+        DeepEPLLPrepareAndFinalize)
+
+## Parallel Processes Utils
+
+P = ParamSpec("P")
+
+
+@dataclasses.dataclass
+class ProcessGroupInfo:
+    world_size: int
+    world_local_size: int
+    rank: int
+    node_rank: int
+    local_rank: int
+    device: torch.device
+
+
+def _worker_parallel_launch(
+    local_rank: int,
+    world_size: int,
+    world_local_size: int,
+    node_rank: int,
+    init_method: str,
+    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> None:
+    rank = node_rank * world_local_size + local_rank
+    torch.cuda.set_device(local_rank)
+    device = torch.device("cuda", local_rank)
+    torch.distributed.init_process_group(
+        backend="cpu:gloo,cuda:nccl",
+        init_method=init_method,
+        rank=rank,
+        world_size=world_size,
+        device_id=device,
+    )
+    barrier = torch.tensor([rank], device=device)
+    torch.distributed.all_reduce(barrier)
+
+    try:
+        worker(
+            ProcessGroupInfo(
+                world_size=world_size,
+                world_local_size=world_local_size,
+                rank=rank,
+                node_rank=node_rank,
+                local_rank=local_rank,
+                device=device,
+            ),
+            *args,
+            **kwargs,
+        )
+    except Exception as ex:
+        print(ex)
+        traceback.print_exc()
+        raise
+    finally:
+        torch.distributed.destroy_process_group()
+
+
+def parallel_launch(
+    world_size: int,
+    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> None:
+    assert not kwargs
+    spawn(
+        _worker_parallel_launch,
+        args=(
+            world_size,
+            world_size,
+            0,
+            "tcp://localhost:29500",
+            worker,
+        ) + args,
+        nprocs=world_size,
+        join=True,
+    )
+
+
+## DeepEP specific utils
+
+
+@dataclasses.dataclass
+class DeepEPHTArgs:
+    num_local_experts: int
+
+
+@dataclasses.dataclass
+class DeepEPLLArgs:
+    max_tokens_per_rank: int
+    hidden_size: int
+    num_experts: int
+    use_fp8_dispatch: bool
+
+
+def make_deepep_ht_a2a(pg: ProcessGroup,
+                       pgi: ProcessGroupInfo,
+                       dp_size: int,
+                       ht_args: DeepEPHTArgs,
+                       q_dtype: Optional[torch.dtype] = None,
+                       block_shape: Optional[list[int]] = None):
+
+    import deep_ep
+
+    # high throughput a2a
+    num_nvl_bytes = 1024 * 1024 * 1024  # 1GB
+    num_rdma_bytes, low_latency_mode, num_qps_per_rank = 0, False, 1
+    buffer = deep_ep.Buffer(group=pg,
+                            num_nvl_bytes=num_nvl_bytes,
+                            num_rdma_bytes=num_rdma_bytes,
+                            low_latency_mode=low_latency_mode,
+                            num_qps_per_rank=num_qps_per_rank)
+    return DeepEPHTPrepareAndFinalize(buffer=buffer,
+                                      world_size=pgi.world_size,
+                                      rank=pgi.rank,
+                                      dp_size=dp_size,
+                                      rank_expert_offset=pgi.rank *
+                                      ht_args.num_local_experts,
+                                      quant_dtype=q_dtype,
+                                      block_shape=block_shape)
+
+
+def make_deepep_ll_a2a(pg: ProcessGroup,
+                       pgi: ProcessGroupInfo,
+                       dp_size: int,
+                       deepep_ll_args: DeepEPLLArgs,
+                       q_dtype: Optional[torch.dtype] = None,
+                       block_shape: Optional[list[int]] = None):
+
+    import deep_ep
+
+    # low-latency a2a
+    num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
+        deepep_ll_args.max_tokens_per_rank, deepep_ll_args.hidden_size,
+        pgi.world_size, deepep_ll_args.num_experts)
+
+    buffer = deep_ep.Buffer(group=pg,
+                            num_rdma_bytes=num_rdma_bytes,
+                            low_latency_mode=True,
+                            num_qps_per_rank=deepep_ll_args.num_experts //
+                            pgi.world_size)
+
+    return DeepEPLLPrepareAndFinalize(
+        buffer=buffer,
+        world_size=pgi.world_size,
+        dp_size=dp_size,
+        max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank,
+        quant_dtype=q_dtype,
+        block_shape=block_shape,
+        use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch,
+    )
+
+
+def make_deepep_a2a(pg: ProcessGroup,
+                    pgi: ProcessGroupInfo,
+                    dp_size: int,
+                    deepep_ht_args: Optional[DeepEPHTArgs],
+                    deepep_ll_args: Optional[DeepEPLLArgs],
+                    q_dtype: Optional[torch.dtype] = None,
+                    block_shape: Optional[list[int]] = None):
+    if deepep_ht_args is not None:
+        assert deepep_ll_args is None
+        return make_deepep_ht_a2a(pg, pgi, dp_size, deepep_ht_args, q_dtype,
+                                  block_shape)
+
+    assert deepep_ll_args is not None
+    return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype,
+                              block_shape)
diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py
index 7d369edfc86a4393ecc1cbc243ee83439b8db084..b0e0feab4689b0ccf771013f283f6d656ddfe982 100644
--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 
diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py
index 7db4fe0f46e3fab44a9637cd75453eaa2b7482ff..474745f94815fcbb91708d84e4c5aee64b991823 100644
--- a/tests/kernels/moe/test_cutlass_moe.py
+++ b/tests/kernels/moe/test_cutlass_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import dataclasses
 from typing import Optional
 
@@ -192,14 +193,10 @@ def run_8_bit(moe_tensors: MOETensors8Bit,
 
     kwargs = {
         'a': moe_tensors.a,
-        'w1_q': moe_tensors.w1_q.transpose(1, 2),  # type: ignore[union-attr]
-        'w2_q': moe_tensors.w2_q.transpose(1, 2),  # type: ignore[union-attr]
+        'w1_q': moe_tensors.w1_q,  # type: ignore[union-attr]
+        'w2_q': moe_tensors.w2_q,  # type: ignore[union-attr]
         'topk_weights': topk_weights,
         'topk_ids': topk_ids,
-        'ab_strides1': moe_tensors.ab_strides1,
-        'c_strides1': moe_tensors.c_strides1,
-        'ab_strides2': moe_tensors.ab_strides2,
-        'c_strides2': moe_tensors.c_strides2,
         'w1_scale': moe_tensors.w1_scale,
         'w2_scale': moe_tensors.w2_scale,
         'a1_scale': moe_tensors.a_scale
diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d7cf39a8cca52aa26d44e1f785a0e27750d6e2a
--- /dev/null
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -0,0 +1,513 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Test DeepEP + DeepGEMM integration 
+DeepGEMM are gemm kernels specialized for the
+fp8 block-quantized case.
+"""
+
+import dataclasses
+import importlib
+from typing import Optional
+
+import pytest
+import torch.distributed
+from torch.distributed import ProcessGroup
+from typing_extensions import ParamSpec
+
+from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEModularKernel)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8)
+from vllm.platforms import current_platform
+
+from .deepep_utils import ProcessGroupInfo, parallel_launch
+
+has_deep_ep = importlib.util.find_spec("deep_ep") is not None
+
+try:
+    import deep_gemm
+    has_deep_gemm = True
+except ImportError:
+    has_deep_gemm = False
+
+if has_deep_ep:
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
+        DeepEPHTPrepareAndFinalize)
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
+        DeepEPLLPrepareAndFinalize)
+
+    from .deepep_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a
+
+if has_deep_gemm:
+    from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+        BatchedDeepGemmExperts)
+    from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+        DeepGemmExperts)
+
+requires_deep_ep = pytest.mark.skipif(
+    not has_deep_ep,
+    reason="Requires deep_ep kernels",
+)
+
+requires_deep_gemm = pytest.mark.skipif(
+    not has_deep_gemm,
+    reason="Requires deep_gemm kernels",
+)
+
+P = ParamSpec("P")
+
+
+def next_power_of_2(x):
+    import math
+    if x == 0:
+        return 1
+    return 2**math.ceil(math.log2(x))
+
+
+def per_block_cast_to_fp8(
+        x: torch.Tensor,
+        block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (deep_gemm.ceil_div(m, 128) * 128,
+         deep_gemm.ceil_div(n, block_size_n) * block_size_n),
+        dtype=x.dtype,
+        device=x.device)
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, block_size_n)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
+    x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
+    scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
+    return x_scaled_sub, scales
+
+
+def make_block_quant_fp8_weights(
+    e: int,
+    n: int,
+    k: int,
+    block_size: list[int],
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Return weights w1, w2, w1q, w2q, w1_scale, w2_scale
+    """
+    dtype = torch.bfloat16
+
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    w1_bf16 = torch.randn((e, 2 * n, k), dtype=dtype) / 10
+    w1_bf16 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype)
+
+    w2_bf16 = torch.randn((e, k, n), dtype=dtype) / 10
+    w2_bf16 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype)
+
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles_w1 = ((2 * n) + block_n - 1) // block_n
+    k_tiles_w1 = (k + block_k - 1) // block_k
+    n_tiles_w2 = (k + block_n - 1) // block_n
+    k_tiles_w2 = (n + block_k - 1) // block_k
+
+    w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn)
+    w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn)
+
+    w1_s = torch.empty((e, n_tiles_w1, k_tiles_w1),
+                       device="cuda",
+                       dtype=torch.float32)
+    w2_s = torch.empty((e, n_tiles_w2, k_tiles_w2),
+                       device="cuda",
+                       dtype=torch.float32)
+
+    assert w1_s.shape == (e, (2 * n + 127) // 128, (k + 127) // 128)
+    assert (w2.shape[-2] + block_n - 1) // block_n == w2_s.shape[-2]
+
+    for i in range(e):
+        w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i])
+        w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i])
+
+    return w1, w2, w1_s, w2_s
+
+
+@dataclasses.dataclass
+class TestConfig:
+    topk: int
+    m: int
+    k: int
+    n: int
+    num_experts: int
+    block_size: list[int]
+    # configs for testing low-latency kernels
+    low_latency: bool
+    use_fp8_dispatch: Optional[bool] = False
+
+
+@dataclasses.dataclass
+class TestTensors:
+    rank_tokens: torch.Tensor  # all ranks make this many tokens
+    rank_token_scales: Optional[torch.Tensor]
+    topk: torch.Tensor
+    topk_weights: torch.Tensor
+    config: TestConfig
+
+    @staticmethod
+    def make(config: TestConfig, rank) -> "TestTensors":
+
+        dtype = torch.bfloat16
+        topk, m, k, block_size = (config.topk, config.m, config.k,
+                                  config.block_size)
+
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        rank_tokens = torch.randn(
+            (m, k), device=torch.cuda.current_device(), dtype=dtype) / 10.0
+        rank_tokens = rank_tokens.clamp(min=fp8_min, max=fp8_max)
+
+        block_k = block_size[1]
+        _, rank_token_scales = per_token_group_quant_fp8(rank_tokens, block_k)
+
+        topk_ids = torch.randint(
+            low=0,
+            high=config.num_experts,
+            size=(m, topk),
+            device=torch.cuda.current_device()).to(dtype=torch.int64)
+
+        topk_weights = torch.randn(topk_ids.shape,
+                                   dtype=torch.float32,
+                                   device=torch.cuda.current_device())
+
+        return TestTensors(rank_tokens=rank_tokens,
+                           rank_token_scales=rank_token_scales,
+                           topk=topk_ids,
+                           topk_weights=topk_weights,
+                           config=config)
+
+
+def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,
+                           max_tokens_per_rank: int, dp_size: int,
+                           hidden_size: int, q_dtype: Optional[torch.dtype],
+                           test_config: TestConfig) -> FusedMoEModularKernel:
+
+    assert test_config.low_latency
+    assert test_config.use_fp8_dispatch is not None
+
+    a2a: DeepEPLLPrepareAndFinalize = make_deepep_a2a(
+        pg=pg,
+        pgi=pgi,
+        dp_size=dp_size,
+        deepep_ht_args=None,
+        deepep_ll_args=DeepEPLLArgs(
+            max_tokens_per_rank=max_tokens_per_rank,
+            hidden_size=hidden_size,
+            num_experts=test_config.num_experts,
+            use_fp8_dispatch=test_config.use_fp8_dispatch),
+        q_dtype=q_dtype,
+        block_shape=test_config.block_size)
+
+    fused_experts = BatchedDeepGemmExperts(max_num_tokens=max_tokens_per_rank,
+                                           world_size=pgi.world_size,
+                                           dp_size=dp_size,
+                                           block_shape=test_config.block_size)
+    mk = FusedMoEModularKernel(prepare_finalize=a2a,
+                               fused_experts=fused_experts)
+    return mk
+
+
+def make_ht_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,
+                           dp_size: int, num_local_experts: int,
+                           q_dtype: Optional[torch.dtype],
+                           test_config: TestConfig) -> FusedMoEModularKernel:
+
+    assert not test_config.low_latency
+    assert test_config.use_fp8_dispatch is None
+
+    a2a: DeepEPHTPrepareAndFinalize = make_deepep_a2a(
+        pg=pg,
+        pgi=pgi,
+        dp_size=dp_size,
+        deepep_ht_args=DeepEPHTArgs(num_local_experts=num_local_experts),
+        deepep_ll_args=None,
+        q_dtype=q_dtype,
+        block_shape=test_config.block_size)
+
+    fused_experts = DeepGemmExperts()
+    mk = FusedMoEModularKernel(prepare_finalize=a2a,
+                               fused_experts=fused_experts)
+    return mk
+
+
+def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int,
+                        num_local_experts: int,
+                        test_tensors: TestTensors) -> FusedMoEModularKernel:
+
+    q_dtype = torch.float8_e4m3fn
+    test_config = test_tensors.config
+
+    mk: FusedMoEModularKernel
+    # Make modular kernel
+    if test_config.low_latency:
+        max_tokens_per_rank = max(
+            64, next_power_of_2(test_tensors.rank_tokens.size(0)))
+        hidden_size = test_tensors.rank_tokens.size(-1)
+
+        mk = make_ll_modular_kernel(pg=pg,
+                                    pgi=pgi,
+                                    max_tokens_per_rank=max_tokens_per_rank,
+                                    dp_size=dp_size,
+                                    hidden_size=hidden_size,
+                                    q_dtype=q_dtype,
+                                    test_config=test_config)
+    else:
+        mk = make_ht_modular_kernel(pg, pgi, dp_size, num_local_experts,
+                                    q_dtype, test_config)
+
+    return mk
+
+
+def deepep_deepgemm_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo,
+                             dp_size: int, test_tensors: TestTensors,
+                             w1: torch.Tensor, w2: torch.Tensor,
+                             w1_scale: Optional[torch.Tensor],
+                             w2_scale: Optional[torch.Tensor]) -> torch.Tensor:
+
+    test_config = test_tensors.config
+    num_experts = test_config.num_experts
+    num_local_experts = w1.size(0)
+
+    def build_expert_map():
+        num_local_experts = w1.size(0)
+        expert_map = torch.full((num_experts, ),
+                                fill_value=-1,
+                                dtype=torch.int32)
+        s = pgi.rank * num_local_experts
+        e = s + num_local_experts
+        expert_map[s:e] = torch.tensor(list(range(num_local_experts)))
+        return expert_map.to(device=torch.cuda.current_device(),
+                             dtype=torch.int32)
+
+    # Make modular kernel
+    mk: FusedMoEModularKernel = make_modular_kernel(
+        pg=pg,
+        pgi=pgi,
+        dp_size=dp_size,
+        num_local_experts=num_local_experts,
+        test_tensors=test_tensors)
+
+    # Low-Latency kernels can't dispatch scales.
+    a1_scale = (None
+                if test_config.low_latency else test_tensors.rank_token_scales)
+
+    out = mk.forward(hidden_states=test_tensors.rank_tokens,
+                     w1=w1,
+                     w2=w2,
+                     topk_weights=test_tensors.topk_weights,
+                     topk_ids=test_tensors.topk,
+                     inplace=False,
+                     activation="silu",
+                     global_num_experts=num_experts,
+                     expert_map=build_expert_map(),
+                     w1_scale=w1_scale,
+                     w2_scale=w2_scale,
+                     w1_zp=None,
+                     w2_zp=None,
+                     a1_scale=a1_scale,
+                     a2_scale=None,
+                     apply_router_weight_on_input=False)
+    return out
+
+
+def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor,
+                topk_weights: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
+                w1_scale: torch.Tensor, w2_scale: torch.Tensor,
+                a1_scale: torch.Tensor, block_shape: list[int]):
+
+    return fused_experts(
+        hidden_states=a,
+        w1=w1,
+        w2=w2,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        inplace=False,
+        use_fp8_w8a8=True,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        block_shape=block_shape,
+        # Make sure this is set to False so we
+        # dont end up comparing the same implementation.
+        allow_deep_gemm=False)
+
+
+def _test_deepep_deepgemm_moe(
+    pgi: ProcessGroupInfo,
+    dp_size: int,
+    config: TestConfig,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+):
+    current_platform.seed_everything(pgi.rank)
+
+    w1 = w1.to(device=torch.cuda.current_device())
+    w2 = w2.to(device=torch.cuda.current_device())
+    w1_scale = w1_scale.to(device=torch.cuda.current_device())
+    w2_scale = w2_scale.to(device=torch.cuda.current_device())
+
+    pg = torch.distributed.new_group(list(range(pgi.world_size)))
+    test_tensors = TestTensors.make(config, pgi.rank)
+    block_shape = [
+        w1.size(1) // w1_scale.size(1),
+        w1.size(2) // w1_scale.size(2)
+    ]
+
+    with set_current_vllm_config(VllmConfig()):
+        # Reference
+        triton_moe = triton_impl(a=test_tensors.rank_tokens,
+                                 topk_ids=test_tensors.topk,
+                                 topk_weights=test_tensors.topk_weights,
+                                 w1=w1,
+                                 w2=w2,
+                                 w1_scale=w1_scale,
+                                 w2_scale=w2_scale,
+                                 a1_scale=test_tensors.rank_token_scales,
+                                 block_shape=block_shape)
+
+        # Slice experts for this rank.
+        num_local_experts = config.num_experts // pgi.world_size
+        e_start = num_local_experts * pgi.rank
+        e_end = e_start + num_local_experts
+        w1_ep = w1[e_start:e_end]
+        w2_ep = w2[e_start:e_end]
+        w1_scale_ep = w1_scale[e_start:e_end]
+        w2_scale_ep = w2_scale[e_start:e_end]
+
+        deepep_moe = deepep_deepgemm_moe_impl(
+            pg,
+            pgi,
+            dp_size,
+            test_tensors,
+            w1_ep,
+            w2_ep,
+            w1_scale_ep,
+            w2_scale_ep,
+        )
+
+    torch.testing.assert_close(
+        triton_moe,
+        deepep_moe,
+        atol=6e-2,
+        rtol=6e-2,
+    )
+
+
+MNKs = [
+    (8, 128, 128),
+    (8, 128, 512),
+    (8, 512, 512),
+    (3, 1024, 2048),
+    (32, 128, 1024),
+    (45, 512, 2048),
+    (64, 1024, 1024),
+    (129, 128, 256),
+    (129, 1024, 2048),
+    (222, 1024, 2048),
+]
+
+TOPKS = [2, 6]
+NUM_EXPERTS = [32]
+
+
+@pytest.mark.parametrize("mnk", MNKs)
+@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOPKS)
+@pytest.mark.parametrize("world_dp_size", [(2, 1)])
+@requires_deep_ep
+@requires_deep_gemm
+def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int,
+                                topk: int, world_dp_size: tuple[int, int]):
+    """
+    Tests for High-Throughput DeepEP + DeepGemm integration.
+    """
+
+    m, n, k = mnk
+    current_platform.seed_everything(7)
+
+    if topk > num_experts:
+        pytest.skip(f"Skipping test: topk={topk} > E={num_experts}")
+
+    block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
+    block_size = [block_m, block_m]
+
+    world_size, dp_size = world_dp_size
+    config = TestConfig(topk=topk,
+                        m=m,
+                        k=k,
+                        n=n,
+                        num_experts=num_experts,
+                        block_size=block_size,
+                        low_latency=False,
+                        use_fp8_dispatch=None)
+
+    w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights(
+        num_experts, n, k, block_size)
+
+    parallel_launch(world_size, _test_deepep_deepgemm_moe, dp_size, config, w1,
+                    w2, w1_scale, w2_scale)
+
+
+MNKs = [
+    (1, 128, 2560),
+    (2, 128, 2560),
+    (3, 1024, 2560),
+    (32, 128, 2560),
+    (45, 512, 2560),
+    (64, 1024, 2560),
+    (222, 1024, 2560),
+]
+# Fix tests for USE_FP8_DISPATCH=True
+USE_FP8_DISPATCH = [False]
+
+
+@pytest.mark.parametrize("mnk", MNKs)
+@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOPKS)
+@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH)
+@pytest.mark.parametrize("block_size", [[128, 128]])
+@pytest.mark.parametrize("world_dp_size", [(2, 1)])
+@requires_deep_ep
+@requires_deep_gemm
+def test_ll_deepep_deepgemm_moe(mnk: tuple[int, int,
+                                           int], num_experts: int, topk: int,
+                                use_fp8_dispatch: bool, block_size: list[int],
+                                world_dp_size: tuple[int, int]):
+    """
+    Tests for Low-Latency DeepEP + DeepGemm integration.
+    """
+
+    m, n, k = mnk
+    current_platform.seed_everything(7)
+
+    if topk > num_experts:
+        pytest.skip(f"Skipping test: topk={topk} > E={num_experts}")
+
+    world_size, dp_size = world_dp_size
+    config = TestConfig(
+        topk=topk,
+        m=m,
+        k=k,
+        n=n,
+        num_experts=num_experts,
+        block_size=block_size,
+        low_latency=True,
+        use_fp8_dispatch=use_fp8_dispatch,
+    )
+
+    w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights(
+        num_experts, n, k, block_size)
+
+    parallel_launch(world_size, _test_deepep_deepgemm_moe, dp_size, config, w1,
+                    w2, w1_scale, w2_scale)
diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e029ea9505559c1dff2e9df31aff1bd726c4513
--- /dev/null
+++ b/tests/kernels/moe/test_deepep_moe.py
@@ -0,0 +1,459 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Test deepep dispatch-combine logic
+"""
+
+import dataclasses
+import importlib
+from typing import Optional, Union
+
+import pytest
+import torch.distributed
+from torch.distributed import ProcessGroup
+
+from vllm import _custom_ops as ops
+from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import TritonExperts
+from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+    BatchedTritonExperts)
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEModularKernel)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8)
+from vllm.platforms import current_platform
+
+from .deepep_utils import ProcessGroupInfo, parallel_launch
+
+has_deep_ep = importlib.util.find_spec("deep_ep") is not None
+
+if has_deep_ep:
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
+        DeepEPHTPrepareAndFinalize)
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
+        DeepEPLLPrepareAndFinalize)
+
+    from .deepep_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a
+
+requires_deep_ep = pytest.mark.skipif(
+    not has_deep_ep,
+    reason="Requires deep_ep kernels",
+)
+
+MAX_TOKENS_PER_RANK = 64
+
+
+def make_weights(
+        e, n, k, dtype
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Return weights w1, w2, w1_scale, w2_scale
+    """
+    if dtype in [torch.float16, torch.bfloat16]:
+        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+        return w1, w2, None, None
+
+    # per-out-channel weight quantization
+    assert dtype == torch.float8_e4m3fn
+    w1 = torch.empty((e, 2 * n, k), device="cuda", dtype=torch.float16)
+    w2 = torch.empty((e, k, n), device="cuda", dtype=torch.float16)
+
+    n_b_scales = 2 * n
+    k_b_scales = k
+    w1_q = torch.empty_like(w1, dtype=dtype)
+    w2_q = torch.empty_like(w2, dtype=dtype)
+    w1_scale = torch.empty((e, n_b_scales, 1),
+                           device="cuda",
+                           dtype=torch.float32)
+    w2_scale = torch.empty((e, k_b_scales, 1),
+                           device="cuda",
+                           dtype=torch.float32)
+    for expert in range(e):
+        w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
+            w1[expert], use_per_token_if_dynamic=True)
+        w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
+            w2[expert], use_per_token_if_dynamic=True)
+    return w1_q, w2_q, w1_scale, w2_scale
+
+
+@dataclasses.dataclass
+class TestConfig:
+    dtype: torch.dtype
+    topk: int
+    m: int
+    k: int
+    n: int
+    num_experts: int
+
+
+@dataclasses.dataclass
+class TestTensors:
+    rank_tokens: torch.Tensor  # all ranks make this many tokens
+    rank_token_scales: Optional[torch.Tensor]
+    topk: torch.Tensor
+    topk_weights: torch.Tensor
+    config: TestConfig
+
+    @staticmethod
+    def make(config: TestConfig, low_latency_mode: bool) -> "TestTensors":
+        # TODO (varun) - check that float16 works ?
+        assert config.dtype in [torch.bfloat16, torch.float8_e4m3fn]
+        token_dtype = (torch.bfloat16 if config.dtype == torch.float8_e4m3fn
+                       else config.dtype)
+        rank_tokens = torch.randn(
+            (config.m, config.k), device="cuda", dtype=token_dtype) / 10
+        rank_token_scales = None
+        if config.dtype == torch.float8_e4m3fn:
+            # low_latency_mode kernels dont support per-token quant.
+            _, rank_token_scales = ops.scaled_fp8_quant(
+                rank_tokens, use_per_token_if_dynamic=not low_latency_mode)
+
+        topk = torch.randint(low=0,
+                             high=config.num_experts,
+                             size=(config.m, config.topk),
+                             device="cuda").to(dtype=torch.int64)
+        topk_weights = torch.randn(topk.shape,
+                                   dtype=torch.float32,
+                                   device="cuda")
+        return TestTensors(rank_tokens=rank_tokens,
+                           rank_token_scales=rank_token_scales,
+                           topk=topk,
+                           topk_weights=topk_weights,
+                           config=config)
+
+
+def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,
+                        low_latency_mode: bool, hidden_size: int, dp_size: int,
+                        num_experts: int, num_local_experts: int,
+                        q_dtype: Optional[torch.dtype],
+                        use_fp8_dispatch: bool) -> FusedMoEModularKernel:
+
+    is_quantized = q_dtype is not None
+
+    ht_args: Optional[DeepEPHTArgs] = None
+    ll_args: Optional[DeepEPLLArgs] = None
+
+    if low_latency_mode:
+        ll_args = DeepEPLLArgs(max_tokens_per_rank=MAX_TOKENS_PER_RANK,
+                               hidden_size=hidden_size,
+                               num_experts=num_experts,
+                               use_fp8_dispatch=use_fp8_dispatch)
+    else:
+        assert not use_fp8_dispatch, (
+            "FP8 Dispatch is valid only for low-latency kernels")
+        ht_args = DeepEPHTArgs(num_local_experts=num_local_experts)
+
+    a2a : Union[DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize] = \
+        make_deepep_a2a(pg = pg,
+                        pgi = pgi,
+                        dp_size = dp_size,
+                        q_dtype = q_dtype,
+                        block_shape = None,
+                        deepep_ht_args = ht_args,
+                        deepep_ll_args = ll_args)
+
+    if low_latency_mode:
+        fused_experts = BatchedTritonExperts(
+            max_num_tokens=MAX_TOKENS_PER_RANK,
+            world_size=pgi.world_size,
+            dp_size=dp_size,
+            use_fp8_w8a8=is_quantized,
+            use_int8_w8a8=False,
+            use_int8_w8a16=False,
+            use_int4_w4a16=False)
+    else:
+        fused_experts = TritonExperts(use_fp8_w8a8=is_quantized,
+                                      use_int8_w8a8=False,
+                                      use_int8_w8a16=False,
+                                      use_int4_w4a16=False,
+                                      per_channel_quant=False)
+
+    mk = FusedMoEModularKernel(prepare_finalize=a2a,
+                               fused_experts=fused_experts)
+    return mk
+
+
+def deep_ep_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo,
+                     low_latency_mode: bool, dp_size: int,
+                     test_tensors: TestTensors, w1: torch.Tensor,
+                     w2: torch.Tensor, w1_scale: Optional[torch.Tensor],
+                     w2_scale: Optional[torch.Tensor], num_experts: int,
+                     use_fp8_dispatch: bool) -> torch.Tensor:
+
+    num_local_experts = w1.size(0)
+
+    def build_expert_map():
+        num_local_experts = w1.size(0)
+        expert_map = torch.full((num_experts, ),
+                                fill_value=-1,
+                                dtype=torch.int32)
+        s = pgi.rank * num_local_experts
+        e = s + num_local_experts
+        expert_map[s:e] = torch.tensor(list(range(num_local_experts)))
+        return expert_map.to(device=torch.cuda.current_device(),
+                             dtype=torch.int32)
+
+    hidden_size = test_tensors.rank_tokens.size(1)
+    is_quantized = w1.dtype == torch.float8_e4m3fn
+    q_dtype = None
+    if is_quantized:
+        q_dtype = torch.float8_e4m3fn
+
+    # Make modular kernel
+    mk: FusedMoEModularKernel = make_modular_kernel(pg, pgi, low_latency_mode,
+                                                    hidden_size, dp_size,
+                                                    num_experts,
+                                                    num_local_experts, q_dtype,
+                                                    use_fp8_dispatch)
+
+    out_hidden_states = torch.empty_like(test_tensors.rank_tokens)
+    total_num_tokens = test_tensors.rank_tokens.size(0)
+
+    def process_chunk(chunk_start, chunk_end, skip_result_store=False):
+        rank_tokens_chunk = test_tensors.rank_tokens[chunk_start:chunk_end]
+        topk_weights_chunk = test_tensors.topk_weights[chunk_start:chunk_end]
+        topk_chunk = test_tensors.topk[chunk_start:chunk_end]
+        rank_token_scales_chunk = test_tensors.rank_token_scales
+        if rank_token_scales_chunk is not None and rank_token_scales_chunk.size(
+                0) == total_num_tokens:
+            # per act token
+            rank_token_scales_chunk = rank_token_scales_chunk[
+                chunk_start:chunk_end]
+
+        out = mk.forward(hidden_states=rank_tokens_chunk,
+                         w1=w1,
+                         w2=w2,
+                         topk_weights=topk_weights_chunk,
+                         topk_ids=topk_chunk,
+                         inplace=False,
+                         activation="silu",
+                         global_num_experts=num_experts,
+                         expert_map=build_expert_map(),
+                         w1_scale=w1_scale,
+                         w2_scale=w2_scale,
+                         w1_zp=None,
+                         w2_zp=None,
+                         a1_scale=rank_token_scales_chunk,
+                         a2_scale=None,
+                         apply_router_weight_on_input=False)
+
+        if not skip_result_store:
+            out_hidden_states[chunk_start:chunk_end, :].copy_(
+                out, non_blocking=True)
+
+    max_num_tokens_per_dp = (MAX_TOKENS_PER_RANK
+                             if low_latency_mode else total_num_tokens)
+
+    for chunk_start_ in range(0, total_num_tokens, max_num_tokens_per_dp):
+        chunk_start = chunk_start_
+        chunk_end = min(chunk_start + max_num_tokens_per_dp, total_num_tokens)
+        # clamp start and end
+        chunk_start = min(chunk_start, total_num_tokens - 1)
+        chunk_end = min(chunk_end, total_num_tokens)
+
+        process_chunk(chunk_start,
+                      chunk_end,
+                      skip_result_store=chunk_start_ >= total_num_tokens)
+
+    return out_hidden_states
+
+
+def torch_moe_impl(test_tensors: TestTensors, w1: torch.Tensor,
+                   w2: torch.Tensor, w1_scale: Optional[torch.Tensor],
+                   w2_scale: Optional[torch.Tensor], using_fp8_dispatch: bool):
+
+    a, topk_ids, topk_weights = (test_tensors.rank_tokens, test_tensors.topk,
+                                 test_tensors.topk_weights)
+    if using_fp8_dispatch:
+        # The DeepEP implementation is requested to dispatch using FP8.
+        # For numerical stability for testing, emulate the fp8 dispatch by
+        # blockwise quant and de-quant.
+        a = test_tensors.rank_tokens
+        aq, aq_scale = per_token_group_quant_fp8(a, 128)
+        a = (aq.view(-1, 128).to(torch.float32) * aq_scale.view(-1, 1)).view(
+            a.shape).to(a.dtype)
+
+    is_quantized = w1.dtype == torch.float8_e4m3fn
+    a_dtype = a.dtype
+    if is_quantized:
+        w1 = w1.to(dtype=torch.float32) * w1_scale
+        w2 = w2.to(dtype=torch.float32) * w2_scale
+        a = a.to(dtype=torch.float32)
+
+    m, _ = a.shape
+    topk = topk_ids.size(1)
+    out = torch.zeros_like(a)
+
+    for i in range(m):
+        a_i = a[i]
+        o_i = out[i]
+        for j in range(topk):
+            e = topk_ids[i][j]
+            e_w = topk_weights[i][j]
+            w1_e = w1[e]
+            w2_e = w2[e]
+            o_i += (SiluAndMul()
+                    (a_i @ w1_e.transpose(0, 1)) @ w2_e.transpose(0, 1)) * e_w
+
+    if is_quantized:
+        out = out.to(dtype=a_dtype)
+
+    return out
+
+
+def _deep_ep_moe(
+    pgi: ProcessGroupInfo,
+    low_latency_mode: bool,
+    dp_size: int,
+    config: TestConfig,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: Optional[torch.Tensor],
+    w2_scale: Optional[torch.Tensor],
+    use_fp8_dispatch: bool,
+):
+
+    if not low_latency_mode:
+        assert not use_fp8_dispatch, (
+            "FP8 dispatch interface is available only in low-latency mode")
+
+    is_quantized = w1.dtype == torch.float8_e4m3fn
+    w1 = w1.to(device=torch.cuda.current_device())
+    w2 = w2.to(device=torch.cuda.current_device())
+    if is_quantized:
+        w1_scale = w1_scale.to(  # type: ignore
+            device=torch.cuda.current_device())
+        w2_scale = w2_scale.to(  # type: ignore
+            device=torch.cuda.current_device())
+
+    pg = torch.distributed.new_group(list(range(pgi.world_size)))
+    test_tensors = TestTensors.make(config, low_latency_mode)
+
+    with set_current_vllm_config(VllmConfig()):
+        # Reference
+        torch_combined = torch_moe_impl(test_tensors, w1, w2, w1_scale,
+                                        w2_scale, use_fp8_dispatch)
+
+        # Splice experts for this rank.
+        num_local_experts = config.num_experts // pgi.world_size
+        e_start = num_local_experts * pgi.rank
+        e_end = e_start + num_local_experts
+        w1_ep = w1[e_start:e_end]
+        w2_ep = w2[e_start:e_end]
+
+        w1_scale_ep, w2_scale_ep = None, None
+        if is_quantized:
+            w1_scale_ep = w1_scale[e_start:e_end]  # type: ignore
+            w2_scale_ep = w2_scale[e_start:e_end]  # type: ignore
+        deepep_combined = deep_ep_moe_impl(
+            pg,
+            pgi,
+            low_latency_mode,
+            dp_size,
+            test_tensors,
+            w1_ep,
+            w2_ep,
+            w1_scale_ep,
+            w2_scale_ep,
+            config.num_experts,
+            use_fp8_dispatch,
+        )
+
+    torch.testing.assert_close(
+        torch_combined,
+        deepep_combined,
+        atol=6e-2,
+        rtol=6e-2,
+    )
+
+
+MNKs = [
+    (1, 128, 128),
+    (2, 128, 512),
+    (3, 1024, 2048),
+    (32, 128, 1024),
+    (45, 512, 2048),
+    (64, 1024, 1024),
+    (222, 1024, 2048),
+]
+
+DTYPES = [torch.bfloat16, torch.float8_e4m3fn]
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("mnk", MNKs)
+@pytest.mark.parametrize("num_experts", [32])
+@pytest.mark.parametrize("topk", [6])
+@pytest.mark.parametrize("world_dp_size", [(2, 1)])
+@requires_deep_ep
+def test_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int],
+                     num_experts: int, topk: int, world_dp_size: tuple[int,
+                                                                       int]):
+    low_latency_mode = False
+    use_fp8_dispatch = False
+    m, n, k = mnk
+
+    current_platform.seed_everything(7)
+    world_size, dp_size = world_dp_size
+    config = TestConfig(dtype=dtype,
+                        topk=topk,
+                        m=m,
+                        k=k,
+                        n=n,
+                        num_experts=num_experts)
+
+    w1, w2, w1_scale, w2_scale = make_weights(num_experts, n, k, dtype)
+
+    parallel_launch(world_size, _deep_ep_moe, low_latency_mode, dp_size,
+                    config, w1, w2, w1_scale, w2_scale, use_fp8_dispatch)
+
+
+MNKs = [
+    (1, 128, 2560),
+    (2, 128, 2560),
+    (3, 1024, 2560),
+    (32, 128, 2560),
+    (45, 512, 2560),
+    (64, 1024, 2560),
+    (222, 1024, 2560),
+]
+DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
+USE_FP8_DISPATCH = [True, False]
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("mnk", MNKs)
+@pytest.mark.parametrize("num_experts", [32])
+@pytest.mark.parametrize("topk", [6])
+@pytest.mark.parametrize("world_dp_size", [(2, 1)])
+@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH)
+@requires_deep_ep
+def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int],
+                                 num_experts: int, topk: int,
+                                 world_dp_size: tuple[int, int],
+                                 use_fp8_dispatch: bool):
+
+    low_latency_mode = True
+    m, n, k = mnk
+
+    if (low_latency_mode
+            and k not in DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES):
+        pytest.skip(
+            f"Skipping test as hidden size {k} is not in list of supported "
+            f"hidden sizes {DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES}"
+        )
+
+    current_platform.seed_everything(7)
+    world_size, dp_size = world_dp_size
+    config = TestConfig(dtype=dtype,
+                        topk=topk,
+                        m=m,
+                        k=k,
+                        n=n,
+                        num_experts=num_experts)
+
+    w1, w2, w1_scale, w2_scale = make_weights(num_experts, n, k, dtype)
+
+    parallel_launch(world_size, _deep_ep_moe, low_latency_mode, dp_size,
+                    config, w1, w2, w1_scale, w2_scale, use_fp8_dispatch)
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
index 299279390fe0c85ee3ef4abcf5d66e3ddcddbadc..7238813a299d6d42aee237daa0f70f0cc467bf14 100644
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the MOE layers.
 
 Run `pytest tests/kernels/test_moe.py`.
diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py
index 10e6ac64df877e0f3e7d9bda6095ce62afb8da15..7cc83b512c8b94a37de13eaf06980319af73b7d3 100644
--- a/tests/kernels/moe/test_moe_permute_unpermute.py
+++ b/tests/kernels/moe/test_moe_permute_unpermute.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the MOE permute/unpermute kernel
 
 Run `pytest tests/kernels/test_moe_permute_unpermute.py`.
diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py
index ae63b379f39d10fc72f49f2222d3354f101ba9e8..22482d9ca85a6aabd3c486619fde0870dc379f0f 100644
--- a/tests/kernels/moe/test_nvfp4_moe.py
+++ b/tests/kernels/moe/test_nvfp4_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
@@ -79,7 +80,10 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int,
                 w2[expert], w2_gs[expert])
 
         score = torch.randn((m, e), device="cuda", dtype=dtype)
-        topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
+        topk_weights, topk_ids, _ = fused_topk(a,
+                                               score,
+                                               topk,
+                                               renormalize=False)
 
         a1_gs = torch.ones((e, ), device="cuda", dtype=torch.float32)
         a2_gs = torch.ones((e, ), device="cuda", dtype=torch.float32)
diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef3e6adcfa364c126ff97eff7566aefdf46bedd8
--- /dev/null
+++ b/tests/kernels/moe/test_pplx_cutlass_moe.py
@@ -0,0 +1,287 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from tests.pplx_utils import ProcessGroupInfo, parallel_launch
+from vllm import _custom_ops as ops
+from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEModularKernel)
+from vllm.platforms import current_platform
+
+try:
+    from pplx_kernels import AllToAll
+    from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id,
+                                      nvshmem_finalize, nvshmem_get_unique_id,
+                                      nvshmem_init)
+    has_pplx = True
+except ImportError:
+    has_pplx = False
+
+requires_pplx = pytest.mark.skipif(
+    not has_pplx,
+    reason="Requires PPLX kernels",
+)
+
+NUM_EXPERTS = [40, 64]
+TOP_KS = [6, 8]
+
+
+def rank_chunk(num, r, w):
+    rem = num % w
+    return (num // w) + (1 if r < rem else 0)
+
+
+def chunk_by_rank(t, r, w):
+    num = t.shape[0]
+    chunk = rank_chunk(num, r, w)
+    rem = num % w
+    if rem == 0 or r < rem:
+        return t[(r * chunk):(r + 1) * chunk].contiguous()
+    else:
+        long_chunks = (num // w + 1) * rem
+        short_chunks = (r - rem) * chunk
+        start = long_chunks + short_chunks
+        return t[start:start + chunk].contiguous()
+
+
+def pplx_cutlass_moe(
+    pgi: ProcessGroupInfo,
+    dp_size: int,
+    a: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    a1_scale: torch.Tensor,
+    out_dtype,
+    per_act_token: bool,
+    per_out_ch: bool,
+):
+    from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
+        PplxPrepareAndFinalize)
+    assert torch.cuda.current_device() == pgi.local_rank
+
+    num_tokens, hidden_dim = a.shape
+    num_experts = w1.shape[0]
+    block_size = hidden_dim  # TODO support more cases
+    device = pgi.device
+    rank = pgi.rank
+    world_size = pgi.world_size
+    rank_num_tokens = rank_chunk(num_tokens, rank, world_size)
+    max_num_tokens = rank_chunk(num_tokens, 0, world_size)
+    topk = topk_ids.shape[1]
+
+    if block_size == hidden_dim:
+        scale_elems = 4  # hack to circumvent pplx data format requirements
+    else:
+        scale_elems = (hidden_dim + block_size - 1) // block_size
+
+    ata = AllToAll.internode(
+        max_num_tokens=max_num_tokens,
+        num_experts=num_experts,
+        experts_per_token=topk,
+        rank=rank,
+        world_size=pgi.world_size,
+        dp_size=dp_size,
+        hidden_dim=hidden_dim,
+        hidden_dim_bytes=hidden_dim,  # because a.dtype.itemsize == 1
+        hidden_dim_scale_bytes=scale_elems * torch.float32.itemsize,
+    )
+
+    w1 = w1.to(device)
+    w2 = w2.to(device)
+    w1_scale = w1_scale.to(device)
+    w2_scale = w2_scale.to(device)
+    a1_scale = a1_scale.to(device)
+
+    prepare_finalize = PplxPrepareAndFinalize(
+        ata,
+        max_num_tokens,
+        pgi.world_size,
+        rank,
+        dp_size,
+        quant_dtype=torch.float8_e4m3fn,
+        per_act_token=per_act_token,
+    )
+
+    experts = CutlassExpertsFp8((num_experts + world_size - 1) // world_size,
+                                out_dtype, per_act_token, per_out_ch)
+
+    fused_cutlass_experts = FusedMoEModularKernel(
+        prepare_finalize,
+        experts,
+    )
+
+    a_chunk = chunk_by_rank(a, rank, world_size).to(device)
+    chunk_topk_weight = chunk_by_rank(topk_weights, rank,
+                                      world_size).to(device)
+    chunk_topk_ids = chunk_by_rank(topk_ids, rank,
+                                   world_size).to(torch.uint32).to(device)
+
+    out = fused_cutlass_experts(
+        a_chunk,
+        chunk_by_rank(w1, rank, world_size),
+        chunk_by_rank(w2, rank, world_size),
+        chunk_topk_weight,
+        chunk_topk_ids,
+        global_num_experts=num_experts,
+        expert_map=None,  #TODO
+        w1_scale=chunk_by_rank(w1_scale, rank, world_size),
+        w2_scale=chunk_by_rank(w2_scale, rank, world_size),
+        a1_scale=chunk_by_rank(a1_scale, rank, world_size)
+        if per_act_token else a1_scale[rank])
+
+    torch.cuda.synchronize()
+
+    ata.destroy()
+
+    return out[:rank_num_tokens]
+
+
+vllm_config = VllmConfig()
+vllm_config.scheduler_config.max_num_seqs = 128
+vllm_config.scheduler_config.max_model_len = 8192
+
+
+def torch_moe2(a, w1, w2, topk_weight, topk_ids):
+    M, K = a.shape
+    topk = topk_ids.shape[1]
+    a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)
+    out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    num_experts = w1.shape[0]
+    for i in range(num_experts):
+        mask = (topk_ids == i).view(-1)
+        if mask.sum():
+            out[mask] = SiluAndMul()(
+                a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
+
+    return (out.view(M, -1, w2.shape[1]) *
+            topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1)
+
+
+def _pplx_moe(
+    pgi: ProcessGroupInfo,
+    dp_size: int,
+    a: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    a1_scale: torch.Tensor,
+    out_dtype,
+    a_full: torch.Tensor,
+    w1_full: torch.Tensor,
+    w2_full: torch.Tensor,
+    per_act_token: bool,
+    per_out_ch: bool,
+):
+    uid = nvshmem_get_unique_id(
+    ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
+    torch.distributed.broadcast(uid, src=0)
+    nvshmem_init(uid, pgi.rank, pgi.world_size)
+
+    with set_current_vllm_config(vllm_config):
+        torch_output = torch_moe2(a_full, w1_full, w2_full, topk_weights,
+                                  topk_ids)
+        pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
+                                       w2_scale, topk_weights, topk_ids,
+                                       a1_scale, out_dtype, per_act_token,
+                                       per_out_ch)
+
+        torch_output = chunk_by_rank(torch_output, pgi.rank,
+                                     pgi.world_size).to(pplx_output.device)
+
+    # Uncomment if more debugging is needed
+    # print("PPLX OUT:", pplx_output)
+    # print("TORCH OUT:", torch_output)
+
+    torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0)
+
+    nvshmem_finalize()
+
+
+@pytest.mark.parametrize("m", [2, 224])
+@pytest.mark.parametrize("n", [3072])
+@pytest.mark.parametrize("k", [1536])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("per_act_token", [True, False])
+@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("world_dp_size", [[2, 1]])  #, [4, 2]])
+@pytest.mark.skipif(
+    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
+        current_platform.get_device_capability()),
+    reason="Grouped gemm is not supported on this GPU type.")
+@requires_pplx
+def test_cutlass_moe_pplx(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    per_act_token: bool,
+    per_out_ch: bool,
+    world_dp_size: tuple[int, int],
+):
+    current_platform.seed_everything(7)
+
+    with set_current_vllm_config(vllm_config):
+
+        dtype = torch.half
+
+        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10.0
+        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10.0
+        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10.0
+
+        n_b_scales = 2 * n if per_out_ch else 1
+        k_b_scales = k if per_out_ch else 1
+
+        w1_q = torch.empty((e, 2 * n, k),
+                           device="cuda",
+                           dtype=torch.float8_e4m3fn)
+        w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
+        w1_scale = torch.empty((e, n_b_scales, 1),
+                               device="cuda",
+                               dtype=torch.float32)
+        w2_scale = torch.empty((e, k_b_scales, 1),
+                               device="cuda",
+                               dtype=torch.float32)
+
+        for expert in range(e):
+            w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
+                w1[expert], use_per_token_if_dynamic=per_out_ch)
+            w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
+                w2[expert], use_per_token_if_dynamic=per_out_ch)
+
+        w1_d = torch.empty_like(w1)
+        w2_d = torch.empty_like(w2)
+        for expert in range(e):
+            w1_d[expert] = (w1_q[expert].float() * w1_scale[expert]).half()
+            w2_d[expert] = (w2_q[expert].float() * w2_scale[expert]).half()
+
+        score = torch.randn((m, e), device="cuda", dtype=dtype)
+        topk_weights, topk_ids, _ = fused_topk(a,
+                                               score,
+                                               topk,
+                                               renormalize=False)
+
+        world_size, dp_size = world_dp_size
+        a_scale1 = torch.randn(
+            (m if per_act_token else 1, 1), device="cuda",
+            dtype=torch.float32) / 10.0
+        if not per_act_token:
+            a_scale1 = a_scale1.repeat(world_size, 1)
+
+        parallel_launch(world_size, _pplx_moe, dp_size, a, w1_q, w2_q,
+                        w1_scale, w2_scale, topk_weights, topk_ids, a_scale1,
+                        dtype, a, w1_d, w2_d, per_act_token, per_out_ch)
diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
index 8c4a2c3fa440ff47ae8fe0797e93c12c95fd6a37..0b48bbef6cebf1a897fadf8dd6b65baec69ff32d 100644
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -1,12 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the MOE layers.
 
 Run `pytest tests/kernels/test_pplx_moe.py`.
 """
-import dataclasses
-import os
-import traceback
-from typing import Callable, Optional
+from typing import Optional
 
 import pytest
 import torch
@@ -20,10 +18,7 @@ try:
 except ImportError:
     has_pplx = False
 
-from torch.multiprocessing import (
-    spawn)  # pyright: ignore[reportPrivateImportUsage]
-from typing_extensions import Concatenate, ParamSpec
-
+from tests.pplx_utils import ProcessGroupInfo, parallel_launch
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import override_config
@@ -35,6 +30,11 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEModularKernel)
 from vllm.platforms import current_platform
 
+requires_pplx = pytest.mark.skipif(
+    not has_pplx,
+    reason="Requires PPLX kernels",
+)
+
 PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512),
                        (222, 2048, 1024)]
 
@@ -56,122 +56,6 @@ vllm_config = VllmConfig()
 vllm_config.scheduler_config.max_num_seqs = 128
 vllm_config.scheduler_config.max_model_len = 8192
 
-P = ParamSpec("P")
-
-requires_pplx = pytest.mark.skipif(
-    not has_pplx,
-    reason="Requires PPLX kernels",
-)
-
-
-@dataclasses.dataclass
-class ProcessGroupInfo:
-    world_size: int
-    world_local_size: int
-    rank: int
-    node_rank: int
-    local_rank: int
-    device: torch.device
-
-
-def _worker_parallel_launch(
-    local_rank: int,
-    world_size: int,
-    world_local_size: int,
-    node_rank: int,
-    init_method: str,
-    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
-    *args: P.args,
-    **kwargs: P.kwargs,
-) -> None:
-    rank = node_rank * world_local_size + local_rank
-    torch.cuda.set_device(local_rank)
-    device = torch.device("cuda", local_rank)
-    torch.distributed.init_process_group(
-        backend="cpu:gloo,cuda:nccl",
-        init_method=init_method,
-        rank=rank,
-        world_size=world_size,
-        device_id=device,
-    )
-    barrier = torch.tensor([rank], device=device)
-    torch.distributed.all_reduce(barrier)
-
-    try:
-        worker(
-            ProcessGroupInfo(
-                world_size=world_size,
-                world_local_size=world_local_size,
-                rank=rank,
-                node_rank=node_rank,
-                local_rank=local_rank,
-                device=device,
-            ),
-            *args,
-            **kwargs,
-        )
-    except Exception as ex:
-        print(ex)
-        traceback.print_exc()
-        raise
-    finally:
-        torch.distributed.destroy_process_group()
-
-
-def parallel_launch(
-    world_size: int,
-    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
-    *args: P.args,
-    **kwargs: P.kwargs,
-) -> None:
-    assert not kwargs
-    spawn(
-        _worker_parallel_launch,
-        args=(
-            world_size,
-            world_size,
-            0,
-            "tcp://localhost:29500",
-            worker,
-        ) + args,
-        nprocs=world_size,
-        join=True,
-    )
-
-
-def parallel_launch_from_env(
-    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
-    *args: P.args,
-    **kwargs: P.kwargs,
-) -> None:
-    """
-    Launches a worker function in parallel across all processes in the current
-    environment. The environment must have the following variables set:
-    - WORLD_SIZE: The total number of processes.
-    - WORLD_LOCAL_SIZE: The number of processes on the current node.
-    - NODE_RANK: The rank of the current
-    - MASTER_ADDR: The address of the master process.
-    - MASTER_PORT: The port of the master process.
-    """
-    assert not kwargs
-    world_size = int(os.environ["WORLD_SIZE"])
-    world_local_size = int(os.environ["WORLD_LOCAL_SIZE"])
-    node_rank = int(os.environ["NODE_RANK"])
-    assert "MASTER_ADDR" in os.environ
-    assert "MASTER_PORT" in os.environ
-    spawn(
-        _worker_parallel_launch,
-        args=(
-            world_size,
-            world_local_size,
-            node_rank,
-            "env://",
-            worker,
-        ) + args,
-        nprocs=world_local_size,
-        join=True,
-    )
-
 
 def torch_prepare(
     a: torch.Tensor,
@@ -390,7 +274,7 @@ def pplx_prepare_finalize(pgi: ProcessGroupInfo, dp_size: int, a: torch.Tensor,
     chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
     chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
 
-    b_a, b_a_scale, expert_num_tokens = prepare_finalize.prepare(
+    b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
         a_chunk,
         None,
         None,
diff --git a/tests/kernels/moe/test_rocm_aiter_topk.py b/tests/kernels/moe/test_rocm_aiter_topk.py
index b0d34ddfd4234bc0c41b7a8bc54d88d3cbf770d6..1c51c530c193c12470913f1272d37dc3fad1f722 100644
--- a/tests/kernels/moe/test_rocm_aiter_topk.py
+++ b/tests/kernels/moe/test_rocm_aiter_topk.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # This is a test for the AITER ops.
 # It tests if the AITER ops are
 # 1. correctly registered as custom ops
@@ -35,6 +36,15 @@ def test_rocm_aiter_biased_grouped_topk_custom_op_registration():
     assert callable(torch.ops.vllm.rocm_aiter_biased_grouped_topk)
 
 
+def test_rocm_aiter_grouped_topk_custom_op_registration():
+    """Test that the custom op is correctly registered."""
+    # Check if the op exists in torch.ops.vllm
+    assert hasattr(torch.ops.vllm, 'rocm_aiter_grouped_topk')
+
+    # Check if the op is callable
+    assert callable(torch.ops.vllm.rocm_aiter_grouped_topk)
+
+
 def test_rocm_aiter_biased_grouped_topk_torch_compile_compatibility():
     """Test that the op can be used with torch.compile."""
     # Create test tensors
@@ -120,3 +130,87 @@ def test_rocm_aiter_biased_grouped_topk_torch_compile_compatibility():
                           rtol=1e-2,
                           atol=1e-2)
     assert torch.allclose(topk_ids_original, topk_ids_compiled)
+
+
+def test_rocm_aiter_grouped_topk_torch_compile_compatibility():
+    """Test that the op can be used with torch.compile."""
+    # Create test tensors
+    token = 64
+    expert = 256
+    num_expert_group = 8
+    topk = 8
+    topk_group = 4
+    renormalize = True
+    scoring_func = "softmax"
+    scale_factor = 1.0
+
+    gating_output = torch.randn((token, expert),
+                                dtype=torch.bfloat16,
+                                device="cuda")
+
+    device = gating_output.device
+    topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
+    topk_weights = torch.empty((token, topk),
+                               dtype=torch.float32,
+                               device=device)
+
+    # Define a function that uses the op
+    def grouped_topk_fn(gating_output, topk_weights, topk_ids, scoring_func):
+        return torch.ops.vllm.rocm_aiter_grouped_topk(
+            gating_output, topk_weights, topk_ids, num_expert_group,
+            topk_group, renormalize, scoring_func, scale_factor)
+
+    # Verify the op's fake implementation
+    torch.library.opcheck(torch.ops.vllm.rocm_aiter_grouped_topk,
+                          (gating_output, topk_weights, topk_ids),
+                          kwargs={
+                              "num_expert_group": num_expert_group,
+                              "topk_group": topk_group,
+                              "need_renorm": renormalize,
+                              "scoring_func": scoring_func,
+                              "routed_scaling_factor": scale_factor
+                          },
+                          test_utils=("test_faketensor"))
+
+    # Compile the function with appropriate settings
+    compiled_fn = torch.compile(grouped_topk_fn,
+                                fullgraph=True,
+                                backend="inductor",
+                                mode="reduce-overhead",
+                                dynamic=False)
+
+    topk_weights_original = torch.empty((token, topk),
+                                        dtype=torch.float32,
+                                        device=device)
+    topk_ids_original = torch.empty((token, topk),
+                                    dtype=torch.int32,
+                                    device=device)
+
+    topk_weights_compiled = torch.empty((token, topk),
+                                        dtype=torch.float32,
+                                        device=device)
+    topk_ids_compiled = torch.empty((token, topk),
+                                    dtype=torch.int32,
+                                    device=device)
+
+    # Run both compiled (V1 graph mode) and uncompiled versions (V1 eager mode)
+    grouped_topk_fn(gating_output, topk_weights_original, topk_ids_original,
+                    scoring_func)
+    compiled_fn(gating_output, topk_weights_compiled, topk_ids_compiled,
+                scoring_func)
+
+    # Sort the results for comparison since the order might not be deterministic
+    topk_ids_original, indices_original = torch.sort(topk_ids_original)
+    topk_weights_original = torch.gather(topk_weights_original, 1,
+                                         indices_original)
+
+    topk_ids_compiled, indices_compiled = torch.sort(topk_ids_compiled)
+    topk_weights_compiled = torch.gather(topk_weights_compiled, 1,
+                                         indices_compiled)
+
+    # Verify results match
+    assert torch.allclose(topk_weights_original,
+                          topk_weights_compiled,
+                          rtol=1e-2,
+                          atol=1e-2)
+    assert torch.allclose(topk_ids_original, topk_ids_compiled)
diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py
index 3b5838a99fa156c398b8063b185c32cfa7ff1046..dfd0f35c8da3d4039cdcc2fbc73ae0bf5ca54786 100644
--- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py
+++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_triton_moe_channel_fp8_kernel.py
 import itertools
diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py
index 764924f26783db5115b0624aae5a66ba210298a9..0840cc7b54fcb4f3af38bc89cd2804ba1509dc4e 100644
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
@@ -8,7 +9,7 @@ from vllm.platforms import current_platform
 
 # Using the default value (240.0) from pytorch will cause accuracy
 # issue on dynamic quantization models. Here use 224.0 for rocm.
-ROCM_FP8_MAX = 224.0
+ROCM_FP8FNUZ_MAX = 224.0
 FP8_DTYPE = current_platform.fp8_dtype()
 
 
@@ -26,9 +27,11 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
 
     qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \
             else torch.finfo(quant_dtype)
-    qtype_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \
+    qtype_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
+                                            and current_platform.is_fp8_fnuz() \
                                         else qtype_traits.max
-    qtype_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \
+    qtype_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
+                                            and current_platform.is_fp8_fnuz() \
                                         else qtype_traits.min
     qtype_max = as_float32_tensor(qtype_traits_max)
     s_1 = as_float32_tensor(1.0)
@@ -70,9 +73,11 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
                     -> tuple[torch.tensor, torch.tensor]:
 
     fp8_traits = torch.finfo(FP8_DTYPE)
-    fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \
+    fp8_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
+                                            and current_platform.is_fp8_fnuz() \
                                     else fp8_traits.max
-    fp8_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \
+    fp8_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \
+                                            and current_platform.is_fp8_fnuz() \
                                     else fp8_traits.min
     fp8_max = as_float32_tensor(fp8_traits_max)
     one = as_float32_tensor(1.0)
diff --git a/tests/kernels/quantization/nvfp4_utils.py b/tests/kernels/quantization/nvfp4_utils.py
index 58eaeee1c0b88204073cc244ad916ede944ce149..1095975ab2b419eec32a7f899d7a324eaddcf12a 100644
--- a/tests/kernels/quantization/nvfp4_utils.py
+++ b/tests/kernels/quantization/nvfp4_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 
 from vllm.scalar_type import scalar_types
diff --git a/tests/kernels/quantization/test_allspark_gemm.py b/tests/kernels/quantization/test_allspark_gemm.py
index 896e0265738b7791c6410f86dbdd8bb09062ac51..3de9cb364468453fce03a30f648569df19519238 100644
--- a/tests/kernels/quantization/test_allspark_gemm.py
+++ b/tests/kernels/quantization/test_allspark_gemm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/kernels/quantization/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py
index 7d36172815b78672f24afe651e0717051d256a74..427db3e602921cd243ccee2e09a1c698ef3dd091 100644
--- a/tests/kernels/quantization/test_aqlm.py
+++ b/tests/kernels/quantization/test_aqlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/kernels/quantization/test_awq.py b/tests/kernels/quantization/test_awq.py
index 248b294e546b3450d562df5966636196153ec142..bc0868123d82ac01555fc999d5eba997ab778291 100644
--- a/tests/kernels/quantization/test_awq.py
+++ b/tests/kernels/quantization/test_awq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/quantization/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py
index 3fc3feaf4972c3665b57ccaff2fe12b6ad112315..96797e85bd1255338be802527ad46ca717dc6852 100644
--- a/tests/kernels/quantization/test_awq_triton.py
+++ b/tests/kernels/quantization/test_awq_triton.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the AWQ Triton kernel.
 
 Run `pytest tests/kernels/test_awq_triton.py`.
diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py
index ae05d61173f3336a432fb0c2169dbd8b9593cce1..8c5ee98743d72a4b48294bed340378e924350845 100644
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/sgl-project/sglang/pull/2575
 import itertools
diff --git a/tests/kernels/quantization/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py
index a4e9f83f0eaf1ddebadec2f4b73a9c9e81638948..fa2c9f890d6fb0f08942ebd082826d5cb0b702ea 100644
--- a/tests/kernels/quantization/test_block_int8.py
+++ b/tests/kernels/quantization/test_block_int8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_block_int8.py
 import itertools
diff --git a/tests/kernels/quantization/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py
index d67d2dbb8998101e063df58f576ca5c49c010cf4..878f66647e19e935808e191a19ddab5a267d487b 100644
--- a/tests/kernels/quantization/test_cutlass_2of4_sparse.py
+++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for sparse cutlass kernels
 
 Run `pytest tests/kernels/test_semi_structured.py`.
diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py
index 633addd421f4389f89e3f085109db741f1f97078..c4d349f1a5a0950b28ec6d5506c0a43faa658398 100644
--- a/tests/kernels/quantization/test_cutlass_scaled_mm.py
+++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for cutlass kernels
 
 Run `pytest tests/kernels/test_cutlass.py`.
@@ -631,7 +632,8 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
     ops.cutlass_moe_mm(out_tensors_stacked, a_tensors_stacked,
                        b_tensors_stacked, a_scales_tensors_stacked,
                        b_scales_tensors_stacked, expert_offsets[:-1],
-                       problem_sizes, ab_strides, ab_strides, c_strides)
+                       problem_sizes, ab_strides, ab_strides, c_strides,
+                       per_act_token, per_out_ch)
 
     # Validate each group's result against the baseline
     for g in range(num_experts):
diff --git a/tests/kernels/quantization/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py
index 876cf03fd644c9076aad2f49dc35fe9f0caca954..0a3edd4ddc16a2d61a205683f16d041f1744cb57 100644
--- a/tests/kernels/quantization/test_fp8_quant.py
+++ b/tests/kernels/quantization/test_fp8_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/quantization/test_ggml.py b/tests/kernels/quantization/test_ggml.py
index 73697a6d1242dd4c1c7add0d305b96ad21e633e0..07651fef39bf4bc69e512d7721cfcda49d8bbb76 100644
--- a/tests/kernels/quantization/test_ggml.py
+++ b/tests/kernels/quantization/test_ggml.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import gguf
 import pytest
diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py
index ad755fe7f7a0bf4937ec34e287e529b7c84820e8..436d5cb640219797f512a9f40b2d48f1b0df4b3c 100644
--- a/tests/kernels/quantization/test_gguf.py
+++ b/tests/kernels/quantization/test_gguf.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from pathlib import Path
 
diff --git a/tests/kernels/quantization/test_gptq.py b/tests/kernels/quantization/test_gptq.py
index fea013d9e5795aef49574c2d238a5a09165d6eaf..7fb57a1576bd86c57c081f7c651fa34ce85f791b 100644
--- a/tests/kernels/quantization/test_gptq.py
+++ b/tests/kernels/quantization/test_gptq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/kernels/quantization/test_int8_kernel.py b/tests/kernels/quantization/test_int8_kernel.py
index 4c7543527c323f42cc67aca3fec9a106098c79b3..dc5fecbf4ccc8f67d201b185c6f872fe4c5ac04b 100644
--- a/tests/kernels/quantization/test_int8_kernel.py
+++ b/tests/kernels/quantization/test_int8_kernel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_int8_kernel.py
 import itertools
diff --git a/tests/kernels/quantization/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py
index 25dcb587e4878ebaebeab4b667acb2f909c9bc61..63ccf4a917369ebef38e1d5ad21122dd6b20ccb5 100644
--- a/tests/kernels/quantization/test_int8_quant.py
+++ b/tests/kernels/quantization/test_int8_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py
index 5aeaaa654ed6006e777ed5b7dcb8b0e95a9f9bd3..998171baaf2dec492d3bdd361de0a28fc4de67db 100644
--- a/tests/kernels/quantization/test_machete_mm.py
+++ b/tests/kernels/quantization/test_machete_mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the machete kernel.
 
 Run `pytest tests/kernels/test_machete_mm.py`.
diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py
index 52507b375c2717c645062902a7be72592beb470c..92914bd5cbba7d438b924aa924ba0d1e5f57230a 100644
--- a/tests/kernels/quantization/test_marlin_gemm.py
+++ b/tests/kernels/quantization/test_marlin_gemm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the marlin kernel.
 
 Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
diff --git a/tests/kernels/quantization/test_nvfp4_quant.py b/tests/kernels/quantization/test_nvfp4_quant.py
index b8aa1672100e2ccad5206b02e853e44646dc60b1..3a8f4c17598c2f84842c0d5e2d42468eb5f3c222 100644
--- a/tests/kernels/quantization/test_nvfp4_quant.py
+++ b/tests/kernels/quantization/test_nvfp4_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/kernels/quantization/test_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_nvfp4_scaled_mm.py
index 1f49900b2d90b8df868bde205521e1f5ee81cc2e..0b45c229817524dba575dc247c4be61b26c5705b 100644
--- a/tests/kernels/quantization/test_nvfp4_scaled_mm.py
+++ b/tests/kernels/quantization/test_nvfp4_scaled_mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 from nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX,
diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py
index c7eee899896acea5ad15a0377d29212a4250feb3..533a4fe5967799b4788d40c56a70a403d6426d61 100644
--- a/tests/kernels/quantization/test_rocm_skinny_gemms.py
+++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py
index 30e6eeb8d5660f4dddba72cc4bbe12a3823f6101..8a2cc3baced23a129069eff36d43f719a2b2d6e5 100644
--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the triton_scaled_mm kernel
 
 Run `pytest tests/kernels/test_triton_scaled_mm.py`.
diff --git a/tests/kernels/test_apply_repetition_penalties.py b/tests/kernels/test_apply_repetition_penalties.py
new file mode 100644
index 0000000000000000000000000000000000000000..9115949a165143b2faa008a1fbdb622e80bddf6a
--- /dev/null
+++ b/tests/kernels/test_apply_repetition_penalties.py
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+
+from tests.kernels.utils import opcheck
+from vllm._custom_ops import (apply_repetition_penalties_cuda,
+                              apply_repetition_penalties_torch)
+from vllm.platforms import current_platform
+
+NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025]
+# [stress, stress, stress, Qwen, llama 4]
+VOCAB_SIZES = [17, 256, 1019, 151936, 202048]
+REPETITION_PENALTY_VALUES = [1.05]
+SEEDS = [0]
+DTYPES = [torch.float32, torch.float16]
+
+
+@pytest.mark.parametrize("num_seqs", NUM_SEQS)
+@pytest.mark.parametrize("vocab_size", VOCAB_SIZES)
+@pytest.mark.parametrize("repetition_penalty", REPETITION_PENALTY_VALUES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.skipif(not current_platform.is_cuda(),
+                    reason="This test for checking CUDA kernel")
+@torch.inference_mode()
+def test_apply_repetition_penalties(
+    num_seqs: int,
+    vocab_size: int,
+    repetition_penalty: float,
+    dtype: torch.dtype,
+    seed: int,
+) -> None:
+    """
+    Test the apply_repetition_penalties custom op 
+    against a reference implementation.
+    """
+    current_platform.seed_everything(seed)
+    torch.set_default_device("cuda:0")
+
+    # Create test data
+    logits = torch.randn(num_seqs, vocab_size, dtype=dtype)
+
+    # Create masks with some random tokens marked as repeated
+    prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
+    output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
+
+    # Mark some tokens as repeated in prompt and output
+    prompt_indices = torch.randint(0, vocab_size,
+                                   (num_seqs, max(1, vocab_size // 200)))
+    output_indices = torch.randint(0, vocab_size,
+                                   (num_seqs, max(1, vocab_size // 200)))
+
+    for i in range(num_seqs):
+        prompt_mask[i, prompt_indices[i]] = True
+        output_mask[i, output_indices[i]] = True
+
+    # Create repetition penalties tensor
+    repetition_penalties = torch.full((num_seqs, ),
+                                      repetition_penalty,
+                                      dtype=dtype)
+
+    # Run all three implementations
+    logits_torch = logits.clone()
+    logits_cuda = logits.clone()
+
+    apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask,
+                                     repetition_penalties)
+    apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask,
+                                    repetition_penalties)
+
+    # Compare all outputs to reference
+    torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3)
+
+    # Test the operator by applying the opcheck utility
+    opcheck(torch.ops._C.apply_repetition_penalties_,
+            (logits.clone(), prompt_mask, output_mask, repetition_penalties))
diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/test_cutlass_mla_decode.py
index 87e4bd4b096b3bd2ebe832554de84cb3dac54256..2b745b84dae6c125f8825477f48276503198d3ea 100644
--- a/tests/kernels/test_cutlass_mla_decode.py
+++ b/tests/kernels/test_cutlass_mla_decode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 import torch.nn.functional as F
@@ -75,7 +76,9 @@ def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int,
     pack_factor = 128 // block_size
     block_num = ((block_num + pack_factor - 1) // pack_factor) * pack_factor
 
-    q = torch.randn(bs, h_q, d)
+    # Amplify input values to ensure test coverage of edge cases where CUTLASS
+    # kernel errors occur with split_k settings.
+    q = torch.randn(bs, h_q, d) * 100
     block_table = torch.randint(0,
                                 bs * block_num, (bs, block_num),
                                 dtype=torch.int32)
diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..040ddac10258f5d7abae57cc3b679855770e2a11
--- /dev/null
+++ b/tests/kernels/test_flex_attention.py
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Integration tests for FlexAttention backend vs default backend"""
+
+import random
+
+import numpy as np
+import pytest
+import torch
+from packaging import version
+
+from vllm import LLM, SamplingParams
+
+TORCH_VERSION = version.parse(torch.__version__)
+MINIMUM_TORCH_VERSION = version.parse("2.7.0")
+
+
+def set_seed(seed):
+    """Set seeds for reproducibility"""
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
+    reason="CUDA not available or PyTorch version < 2.7",
+)
+def test_flex_attention_vs_default_backend(monkeypatch):
+    """Test that FlexAttention produces the same outputs as the default backend.
+
+    This test compares the outputs from the FlexAttention backend with
+    the default backend, ensuring they are identical when using the same seed.
+    """
+    model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+    seed = 42
+    max_tokens = 32
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+    ]
+
+    sampling_params = SamplingParams(temperature=0.0,
+                                     top_p=1.0,
+                                     seed=seed,
+                                     max_tokens=max_tokens)
+
+    # Run with flex attention
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
+        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+        set_seed(seed)
+
+        llm_flex = LLM(
+            model_name,
+            tensor_parallel_size=1,
+            num_gpu_blocks_override=128,
+            enforce_eager=True,
+        )
+        output_flex = llm_flex.generate(prompts, sampling_params)
+
+    # Run with default backend
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+        set_seed(seed)
+        llm_default = LLM(
+            model_name,
+            tensor_parallel_size=1,
+            num_gpu_blocks_override=128,
+            enforce_eager=True,
+        )
+        output_default = llm_default.generate(prompts, sampling_params)
+
+    # Compare outputs from both backends
+    for i, (flex_result,
+            default_result) in enumerate(zip(output_flex, output_default)):
+        prompt = prompts[i]
+        flex_text = flex_result.outputs[0].text
+        default_text = default_result.outputs[0].text
+
+        assert flex_text == default_text, (
+            f"FlexAttention output doesn't match default for: {prompt!r}\n"
+            f"FlexAttention: {flex_text!r}\n"
+            f"Default: {default_text!r}")
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/tests/kernels/test_fused_quant_activation.py b/tests/kernels/test_fused_quant_activation.py
index faa8d49ce41be5990e2560e220d36b6716a94f92..803453a20d81d360b6fa5b1f3c02f564458be16d 100644
--- a/tests/kernels/test_fused_quant_activation.py
+++ b/tests/kernels/test_fused_quant_activation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/kernels/test_triton_flash_attention.py b/tests/kernels/test_triton_flash_attention.py
index cf2bdc908e420135d17cecbcb39808c67b6364b7..1c31cfb25e5acd11213f84adaed692f744e182af 100644
--- a/tests/kernels/test_triton_flash_attention.py
+++ b/tests/kernels/test_triton_flash_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the triton_flash_attention kernel
 
 Run `pytest tests/kernels/test_triton_flash_attention.py`.
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 22b3d7c2be7a5c9c5104d8b8e8149b697b5f4574..d1db6a8eb1ba4bfa4f9cfc2300f545f057df48da 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Kernel test utils"""
 
 import itertools
diff --git a/tests/kv_transfer/test_disagg.py b/tests/kv_transfer/test_disagg.py
index dc948a48bf3267da4273fb8ee459cd5186e14150..9f2229cc41dff6809332fffdff24c026753ef627 100644
--- a/tests/kv_transfer/test_disagg.py
+++ b/tests/kv_transfer/test_disagg.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import subprocess
diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py
index c5b34660d1658d8ffab150742fcd08cf48d2daf2..352ab63552de74bd194a5908e11b460bfbc5578a 100644
--- a/tests/kv_transfer/test_lookup_buffer.py
+++ b/tests/kv_transfer/test_lookup_buffer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import random
diff --git a/tests/kv_transfer/test_module.py b/tests/kv_transfer/test_module.py
index 8a6490b5c8876f1d573e40004498a79f6b3e30e1..7a04174870dafaac44c7789108d49f4f8de2ead1 100644
--- a/tests/kv_transfer/test_module.py
+++ b/tests/kv_transfer/test_module.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import subprocess
 import sys
diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py
index 3dd923d24050cb22d18da161e536aaa3e3d16949..32116608a2177ac5315327a05f27dbb103c4c3cb 100644
--- a/tests/kv_transfer/test_send_recv.py
+++ b/tests/kv_transfer/test_send_recv.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import time
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 399311ce65bb823faaa86ba1f07845277fc54b62..4908f9a060f7f27cd2fb633433ff3a476abb4be1 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import tempfile
 from collections import OrderedDict
@@ -163,11 +164,6 @@ def mixtral_lora_files():
     return snapshot_download(repo_id="SangBinCho/mixtral-lora")
 
 
-@pytest.fixture(scope="session")
-def gemma_lora_files():
-    return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
-
-
 @pytest.fixture(scope="session")
 def chatglm3_lora_files():
     return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py
index c8b7a5cbf74703ddd00b1cc9edb3eee0b2c30711..cc8160b2860d91b80434434ff78723d809085e95 100644
--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import time
 
@@ -6,6 +7,8 @@ import pytest
 
 import vllm.envs as env
 from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args)
 from vllm.inputs import TextPrompt
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
@@ -16,14 +19,6 @@ LORA_RANK = 64
 DEFAULT_MAX_LORAS = 4 * 3
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def get_lora_requests(lora_path) -> list[LoRARequest]:
     lora_requests: list[LoRARequest] = [
         LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
@@ -88,17 +83,6 @@ async def test_add_lora(chatglm3_lora_files):
         trust_remote_code=True,
         enforce_eager=True)
 
-    # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
-    # environment variable. reload vllm.enging.async_llm_engine as
-    # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the
-    # env var.
-    import importlib
-
-    import vllm.engine.async_llm_engine
-    importlib.reload(vllm.engine.async_llm_engine)
-    from vllm.entrypoints.openai.api_server import (
-        build_async_engine_client_from_engine_args)
-
     # split lora_requests into 3 parts
     part_size = len(lora_requests) // 3
     dummy_run_requests = lora_requests[:part_size]
diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py
index 007be7aa582ea62de2e3e3b699100c511ad5c97b..774ebb9db21065fc616c36b2aff4cd388289dc60 100644
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py
index 2c18a115be48738a82d1f3647ba8f6ea8f6e6363..5481b413b8f5f02c6ce9831b9a9723e694868790 100644
--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-
-import pytest
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import vllm
 from vllm.lora.request import LoRARequest
@@ -18,14 +17,6 @@ EXPECTED_LORA_OUTPUT = [
 ]
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 0a8b38fa748a6cbcb6555523efc5977c95374f7c..92db023babc28efb6c0bb2e424f021c30a246425 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from copy import deepcopy
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 580992dea53da03543e54118d154127fa254d780..3ac3b80ec827c967f9572fa56b5a417cbd7d8ea6 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -1,11 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess
 import sys
 from typing import Union
 
-import pytest
-import ray
-
 import vllm
 from vllm import LLM
 from vllm.lora.request import LoRARequest
@@ -33,14 +31,6 @@ EXPECTED_LORA_OUTPUT = [
 ]
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM,
               lora_path: str,
               lora_id: int,
@@ -128,37 +118,6 @@ def test_llama_lora(sql_lora_files):
     generate_and_test(llm, sql_lora_files)
 
 
-# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
-# used by the engine yet.
-@pytest.mark.skip_v1
-@create_new_process_for_each_test()
-def test_llama_lora_warmup(sql_lora_files):
-    """Test that the LLM initialization works with a warmup LORA path and
-    is more conservative"""
-
-    @ray.remote(num_gpus=1)
-    def get_num_gpu_blocks_lora():
-        llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
-        num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
-        return num_gpu_blocks_lora_warmup
-
-    @ray.remote(num_gpus=1)
-    def get_num_gpu_blocks_no_lora():
-        llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
-        num_gpu_blocks_no_lora_warmup = (
-            llm.llm_engine.cache_config.num_gpu_blocks)
-        return num_gpu_blocks_no_lora_warmup
-
-    num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote())
-    num_gpu_blocks_no_lora_warmup = ray.get(
-        get_num_gpu_blocks_no_lora.remote())
-    assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, (
-        "The warmup with lora should be more "
-        "conservative than without lora, therefore the number of "
-        "memory blocks for the KV cache should be "
-        "less when using lora than when not using lora")
-
-
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_llama_lora_tp4(sql_lora_files):
diff --git a/tests/lora/test_lora_allowed_token_ids.py b/tests/lora/test_lora_allowed_token_ids.py
index 094541aef02bb893cf2d77211b90db713374e8bb..01bc102bd112bc945d88437181218f07286fbd47 100644
--- a/tests/lora/test_lora_allowed_token_ids.py
+++ b/tests/lora/test_lora_allowed_token_ids.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index 02f2339bef01ddfad8b708ee2e035c9328f3b8f7..ebc0f26378d27ee388079779b10a44f3526a0a15 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py
index 7ae33a848a0aa5f481db8d8ebfce83882cdcbd61..50c60341f0d88b92df7c09304d211b5323730e92 100644
--- a/tests/lora/test_lora_functions.py
+++ b/tests/lora/test_lora_functions.py
@@ -1,14 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Script to test add_lora, remove_lora, pin_lora, list_loras functions.
 """
-
-import os
-
 import pytest
 
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.llm_engine import LLMEngine
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args)
 from vllm.lora.request import LoRARequest
 
 MODEL_PATH = "meta-llama/Llama-2-7b-hf"
@@ -16,14 +16,6 @@ LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
 LORA_RANK = 8
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def make_lora_request(lora_id: int):
     return LoRARequest(lora_name=f"{lora_id}",
                        lora_int_id=lora_id,
@@ -79,22 +71,6 @@ def test_lora_functions_sync():
 @pytest.mark.asyncio
 async def test_lora_functions_async():
 
-    if os.getenv("VLLM_USE_V1") == "0":
-        pytest.skip(
-            reason=
-            "V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions")
-
-    # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
-    # environment variable. reload vllm.enging.async_llm_engine as
-    # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the
-    # env var.
-    import importlib
-
-    import vllm.engine.async_llm_engine
-    importlib.reload(vllm.engine.async_llm_engine)
-    from vllm.entrypoints.openai.api_server import (
-        build_async_engine_client_from_engine_args)
-
     max_loras = 4
     engine_args = AsyncEngineArgs(model=MODEL_PATH,
                                   enable_lora=True,
diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py
index 90498c47fb10431e1727a5dc5cf1f6b16f234226..b46d81f1651a62d4964828701528f936d897d178 100644
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 52b0834cacb8598b050d4476099edfccfb46d2ea..8f8a27006cf67ddd0737c28d0bf4e8c029a12a37 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index 24242b8a17594324e8423fcedd42d4ad1318a9b3..99fe951bbf07009d1fd5c67bfc3c9c56e965bf88 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index aea7691935dfe1e21aba229876039029efbb5b50..0ea07793311cb829aae3fa9cac3b1f25e4cbf804 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
@@ -10,14 +11,6 @@ from vllm.platforms import current_platform
 MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
               prompts: list[str]) -> list[str]:
 
diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py
index 9935472ad18f457e1398520af12c0b12d7915f64..f16589e06b2dcd276b0de80df5490704923b94de 100644
--- a/tests/lora/test_peft_helper.py
+++ b/tests/lora/test_peft_helper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import math
diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py
index 7375cabbc36d905c0c80e0321dffcbb1a9e705a6..9d75512a248bee9030b567fc5ae4b0598cfd7027 100644
--- a/tests/lora/test_phi.py
+++ b/tests/lora/test_phi.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
@@ -10,14 +11,6 @@ MODEL_PATH = "microsoft/phi-2"
 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(
@@ -58,7 +51,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
 
 # Skipping for V1 for now as we are hitting,
 # "Head size 80 is not supported by FlashAttention." error.
-@pytest.mark.skip_v1
+@pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention")
 def test_phi2_lora(phi2_lora_files):
     # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
     # Otherwise, the lora-test will fail due to CUDA OOM.
diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py
index add313c945446c313f46eea06c5f131bc314a634..14fa79ae5b4469ae8b402c682402f5e839ed89b5 100644
--- a/tests/lora/test_punica_ops.py
+++ b/tests/lora/test_punica_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from threading import Lock
 
 import pytest
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index caf71976a2608b64114a2b43427a1d3a80fb03ee..caa31fdb0e73ef3cd7176829a10a7992fe9565a0 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
@@ -24,27 +25,19 @@ if current_platform.is_rocm():
     MODELS = [
         ModelWithQuantization(
             model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-            quantization="GPTQ"),
+            quantization="gptq"),
     ]
 else:
     MODELS = [
         ModelWithQuantization(
             model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
-            quantization="AWQ"),
+            quantization="awq"),
         ModelWithQuantization(
             model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-            quantization="GPTQ"),
+            quantization="gptq"),
     ]
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM,
               lora_path: str,
               lora_id: int,
@@ -100,7 +93,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
             "#ff8050",
             "#ff8080",
         ]
-    elif model.quantization == "AWQ":
+    elif model.quantization == "awq":
         expected_no_lora_output = [
             "I'm sorry, I don't understand",
             "I'm sorry, I don't understand",
@@ -109,7 +102,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
             "#f07700: A v",
             "#f00000: A v",
         ]
-    elif model.quantization == "GPTQ":
+    elif model.quantization == "gptq":
         expected_no_lora_output = [
             "I'm sorry, I don't have",
             "I'm sorry, I don't have",
@@ -122,7 +115,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
     def expect_match(output, expected_output):
         # HACK: GPTQ lora outputs are just incredibly unstable.
         # Assert that the outputs changed.
-        if (model.quantization == "GPTQ"
+        if (model.quantization == "gptq"
                 and expected_output is expected_lora_output):
             assert output != expected_no_lora_output
             for i, o in enumerate(output):
@@ -172,7 +165,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
                                  model):
     if num_gpus_available < 2:
         pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
-    if model.quantization == "GPTQ":
+    if model.quantization == "gptq":
         pytest.skip("GPTQ lora outputs are just incredibly unstable")
     llm_tp1 = vllm.LLM(
         model=model.model_path,
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index 7bd3e3d0fe27f7a79ae321c7a0abf0696b367fab..604bb307b889d1e3f71024002b49fd8c1db4df07 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 from typing import Optional
 
@@ -10,14 +11,7 @@ import vllm
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform
-
-
-@pytest.fixture(autouse=not current_platform.is_cpu())
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
+from vllm.sampling_params import BeamSearchParams
 
 
 @dataclass
@@ -69,7 +63,7 @@ class Qwen2VLTester:
                  expected_outputs: list[str],
                  lora_id: Optional[int] = None,
                  temperature: float = 0,
-                 max_tokens: int = 5) -> list[str]:
+                 max_tokens: int = 5):
 
         sampling_params = vllm.SamplingParams(
             temperature=temperature,
@@ -97,7 +91,35 @@ class Qwen2VLTester:
                 generated), f"Generated text {generated} doesn't "
             f"match expected pattern {expected}"
 
-        return generated_texts
+    def run_beam_search_test(self,
+                             images: list[ImageAsset],
+                             expected_outputs: list[list[str]],
+                             lora_id: Optional[int] = None,
+                             temperature: float = 0,
+                             beam_width: int = 2,
+                             max_tokens: int = 5):
+
+        beam_search_params = BeamSearchParams(beam_width=beam_width,
+                                              max_tokens=max_tokens,
+                                              temperature=temperature)
+
+        inputs = [{
+            "prompt": self.PROMPT_TEMPLATE,
+            "multi_modal_data": {
+                "image": asset.pil_image
+            },
+        } for asset in images]
+
+        lora_request = LoRARequest(str(lora_id), lora_id,
+                                   self.config.lora_path)
+        outputs = self.llm.beam_search(inputs,
+                                       beam_search_params,
+                                       lora_request=lora_request)
+
+        for output_obj, expected_outs in zip(outputs, expected_outputs):
+            output_texts = [seq.text for seq in output_obj.sequences]
+            assert output_texts == expected_outs, \
+                f"Generated texts {output_texts} do not match expected {expected_outs}"  # noqa: E501
 
 
 TEST_IMAGES = [
@@ -110,6 +132,14 @@ EXPECTED_OUTPUTS = [
     "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.",  # noqa: E501
 ]
 
+# NOTE - beam search .text contains the whole text
+EXPECTED_BEAM_SEARCH_OUTPUTS = [
+    [
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic skyscraper stands",  # noqa: E501
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic tower stands tall",  # noqa: E501
+    ],
+]
+
 QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
 QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
 
@@ -130,6 +160,27 @@ def test_qwen2vl_lora(qwen2vl_lora_files):
                         lora_id=lora_id)
 
 
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="Qwen2-VL dependency xformers incompatible with ROCm")
+def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
+    """Test Qwen 2.0 VL model with LoRA through beam search."""
+    config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
+                        lora_path=qwen2vl_lora_files)
+    tester = Qwen2VLTester(config)
+
+    # Test with different LoRA IDs
+    for lora_id in [1, 2]:
+        # NOTE currently, we only test cherry blossom since stop sign
+        # output is slightly different for v1; - the root cause is likely
+        # independent of the intent of this test, which is to ensure beam
+        # search passes through lora through correctly.
+        tester.run_beam_search_test(
+            [ImageAsset("cherry_blossom")],
+            expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS,
+            lora_id=lora_id)
+
+
 @pytest.mark.xfail(
     current_platform.is_rocm(),
     reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
diff --git a/tests/lora/test_resolver.py b/tests/lora/test_resolver.py
index 8ebc2ae98fc4341c162610d7566db0ebb012170c..6c93e577611f83adcac09729b41405fc6fed7761 100644
--- a/tests/lora/test_resolver.py
+++ b/tests/lora/test_resolver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py
index 8845eb33d207e173d02c4878a51c7767a11d1987..6cfdaf50d33c4126c4b34b2164f2f8631f6f7a45 100644
--- a/tests/lora/test_tokenizer_group.py
+++ b/tests/lora/test_tokenizer_group.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transfomers_model.py
index 63907f2c1d02cf3d019c70078007af7c858d6090..5065a2fb71649ace93a34b2ff78b5e03a3a66150 100644
--- a/tests/lora/test_transfomers_model.py
+++ b/tests/lora/test_transfomers_model.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py
index 0d4e0bf681f2c05ddc23af1d8d3129cf163ed51c..b343bef0a920b55b7a295ffc93fa9d25c1212bf4 100644
--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections import OrderedDict
 from typing import NamedTuple, Optional
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index e5ae660af1400aa6bf7b4e680116681a8bdc57b1..9999c1be54ea5fee24b80f93bc21463a9f71ddf1 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import random
@@ -6,8 +7,6 @@ import tempfile
 from typing import Union
 from unittest.mock import patch
 
-import pytest
-
 import vllm.envs as envs
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
@@ -17,13 +16,7 @@ from vllm.lora.request import LoRARequest
 from vllm.v1.worker.gpu_worker import Worker as V1Worker
 from vllm.worker.worker import Worker
 
-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
+NUM_LORAS = 16
 
 
 @patch.dict(os.environ, {"RANK": "0"})
@@ -67,12 +60,12 @@ def test_worker_apply_lora(sql_lora_files):
         device_config=DeviceConfig("cuda"),
         cache_config=CacheConfig(
             block_size=16,
-            gpu_memory_utilization=1.0,
             swap_space=0,
             cache_dtype="auto",
         ),
-        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
-                               max_loras=32),
+        lora_config=LoRAConfig(max_lora_rank=8,
+                               max_cpu_loras=NUM_LORAS,
+                               max_loras=NUM_LORAS),
     )
     worker = worker_cls(
         vllm_config=vllm_config,
@@ -87,9 +80,9 @@ def test_worker_apply_lora(sql_lora_files):
     set_active_loras(worker, [])
     assert worker.list_loras() == set()
 
-    n_loras = 32
     lora_requests = [
-        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
+        LoRARequest(str(i + 1), i + 1, sql_lora_files)
+        for i in range(NUM_LORAS)
     ]
 
     set_active_loras(worker, lora_requests)
@@ -98,12 +91,12 @@ def test_worker_apply_lora(sql_lora_files):
         for lora_request in lora_requests
     }
 
-    for i in range(32):
+    for i in range(NUM_LORAS):
         random.seed(i)
         iter_lora_requests = random.choices(lora_requests,
-                                            k=random.randint(1, n_loras))
+                                            k=random.randint(1, NUM_LORAS))
         random.shuffle(iter_lora_requests)
-        iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
+        iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)]
         set_active_loras(worker, lora_requests)
         assert worker.list_loras().issuperset(
             {lora_request.lora_int_id
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index 59a0e7420fc25e0419ff42a64997b3af3cf427a8..cc1b0d81955bc05c08ae7e5321724092bd15b5f5 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Optional, Union
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index e71c87ff3fc8228c70f90fb40d67817714532b12..7bb5d8980d614a561a20e62c4a0852b5d0717631 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 
diff --git a/tests/mistral_tool_use/conftest.py b/tests/mistral_tool_use/conftest.py
index 39ab01c9b8741b26081100690f58d1f82a0df927..e89e60c5a02ec859456a7e184bb8a37eafd6bec8 100644
--- a/tests/mistral_tool_use/conftest.py
+++ b/tests/mistral_tool_use/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import pytest_asyncio
diff --git a/tests/mistral_tool_use/test_mistral_tool_calls.py b/tests/mistral_tool_use/test_mistral_tool_calls.py
index bbb3a07895f6c1310326e6211620cb5a0787a9e1..9bf6863f3f2b7a3ef5001ebf981eab25ca2b4299 100644
--- a/tests/mistral_tool_use/test_mistral_tool_calls.py
+++ b/tests/mistral_tool_use/test_mistral_tool_calls.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai
 import pytest
diff --git a/tests/mistral_tool_use/utils.py b/tests/mistral_tool_use/utils.py
index 1d809a05e89d1776ee44a967aa7b6ae4fd757abe..7a026cd9bb619d43fae63b0b97af2bb45a835ae7 100644
--- a/tests/mistral_tool_use/utils.py
+++ b/tests/mistral_tool_use/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/model_executor/conftest.py b/tests/model_executor/conftest.py
index b588a1a96638bdc265e434696471f9d6a13e5bb0..c6d89d849e9f9f24785aa2d44f145f209d3364e1 100644
--- a/tests/model_executor/conftest.py
+++ b/tests/model_executor/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py
index e957db5b3f16a0425cdf2ce8a2c6c302dc35b05d..a94215ee397bf23086ef5ba4823b66a08a212a28 100644
--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py
index 6cd966f84802bc703d9bef89228d77927111869c..ac31064d9212089bad65ef79aa647ad02b9a55c7 100644
--- a/tests/model_executor/test_guided_processors.py
+++ b/tests/model_executor/test_guided_processors.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import pickle
diff --git a/tests/test_logits_processor.py b/tests/model_executor/test_logits_processor.py
similarity index 98%
rename from tests/test_logits_processor.py
rename to tests/model_executor/test_logits_processor.py
index 8301c645b79f8ce8ea2b7633ebc03699a8a89930..532ebba038d38cceec508916d10009fb7993686b 100644
--- a/tests/test_logits_processor.py
+++ b/tests/model_executor/test_logits_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from unittest.mock import patch
diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py
index f8efa2eff857b944db62a85c8c99222e85210483..94a14bd24bcb67365d4a5af234faa3e812491369 100644
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
 import pytest
 
-from vllm.model_executor.layers.pooler import CLSPool, PoolingType
+from vllm.model_executor.layers.pooler import CLSPool, MeanPool, PoolingType
 from vllm.model_executor.models.bert import BertEmbeddingModel
 from vllm.model_executor.models.roberta import RobertaEmbeddingModel
 from vllm.platforms import current_platform
@@ -14,7 +15,7 @@ MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5")
 REVISION = os.environ.get("REVISION", "main")
 
 MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME",
-                                    "intfloat/multilingual-e5-small")
+                                    "intfloat/multilingual-e5-base")
 REVISION_ROBERTA = os.environ.get("REVISION", "main")
 
 
@@ -40,17 +41,15 @@ def test_model_loading_with_params(vllm_runner):
 
         # asserts on the pooling config files
         assert model_config.pooler_config.pooling_type == PoolingType.CLS.name
-        assert model_config.pooler_config.pooling_norm
+        assert model_config.pooler_config.normalize
 
         # asserts on the tokenizer loaded
         assert model_tokenizer.tokenizer_id == "BAAI/bge-base-en-v1.5"
-        assert model_tokenizer.tokenizer_config["do_lower_case"]
         assert model_tokenizer.tokenizer.model_max_length == 512
 
         def check_model(model):
             assert isinstance(model, BertEmbeddingModel)
-            assert model._pooler.pooling_type == PoolingType.CLS
-            assert model._pooler.normalize
+            assert isinstance(model._pooler, CLSPool)
 
         vllm_model.apply_model(check_model)
 
@@ -80,16 +79,15 @@ def test_roberta_model_loading_with_params(vllm_runner):
 
         # asserts on the pooling config files
         assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name
-        assert model_config.pooler_config.pooling_norm
+        assert model_config.pooler_config.normalize
 
         # asserts on the tokenizer loaded
-        assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-small"
-        assert not model_tokenizer.tokenizer_config["do_lower_case"]
+        assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-base"
+        assert model_tokenizer.tokenizer.model_max_length == 512
 
         def check_model(model):
             assert isinstance(model, RobertaEmbeddingModel)
-            assert model._pooler.pooling_type == PoolingType.MEAN
-            assert model._pooler.normalize
+            assert isinstance(model._pooler, MeanPool)
 
         vllm_model.apply_model(check_model)
 
diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/test_weight_utils.py
similarity index 96%
rename from tests/model_executor/weight_utils.py
rename to tests/model_executor/test_weight_utils.py
index bdaba22c3c7a8962485bf976d671f8d6b077a0af..df625b8d60049e37a6b7a6c827bb595661525fda 100644
--- a/tests/model_executor/weight_utils.py
+++ b/tests/model_executor/test_weight_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import tempfile
diff --git a/tests/models/language/generation/test_bart.py b/tests/models/language/generation/test_bart.py
index 8ab0167dc771d2b8d50b6bedcb33de01cfa23acb..7d8acab5e834380d0b66f870e27dfba21f1f445d 100644
--- a/tests/models/language/generation/test_bart.py
+++ b/tests/models/language/generation/test_bart.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import pytest
diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py
index 05dd18fbdf8b341538add0f68a3aa77b77ab53f3..f656f90c4bd37b12ceb68939539b2afd94ad75d7 100644
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from typing import Optional
 
@@ -86,7 +87,6 @@ AITER_MODEL_LIST = [
         pytest.param("bigcode/starcoder2-3b"),  # starcoder2
         pytest.param(
             "TitanML/tiny-mixtral",  # mixtral
-            marks=[pytest.mark.cpu_model],
         )
     ])
 @pytest.mark.parametrize("max_tokens", [32])
diff --git a/tests/models/language/generation/test_granite.py b/tests/models/language/generation/test_granite.py
index f381c34f44b8ca546f00f268ffade99ee303c0b5..2a39f78a708eed6127ae9800a617a93380261fdc 100644
--- a/tests/models/language/generation/test_granite.py
+++ b/tests/models/language/generation/test_granite.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 from ...utils import check_logprobs_close
diff --git a/tests/models/language/generation/test_granitemoehybrid.py b/tests/models/language/generation/test_granitemoehybrid.py
index da3f5e1100bfd37b6d419f7707ae4d795857d14f..952449f284159623212c810d4a04537bf9d6fe90 100644
--- a/tests/models/language/generation/test_granitemoehybrid.py
+++ b/tests/models/language/generation/test_granitemoehybrid.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py
index 604cb854b32ff95547a1f95abfc4a8fc71770098..3eaadcb45fe129b7c831ac21d88d117b5833c16a 100644
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py
index c1b612ae213b97b2035007584fcbf7b9f77a90e4..bdd857ff50620b9bd2fce57fc957e7cabd59809d 100644
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 import json
 
diff --git a/tests/models/language/generation/test_phimoe.py b/tests/models/language/generation/test_phimoe.py
index 603ca1cb12a5b61f7bf75d327a6d5b750c12745c..6c9cc2821c30fbe2e72337a72b3204b38d8c87d8 100644
--- a/tests/models/language/generation/test_phimoe.py
+++ b/tests/models/language/generation/test_phimoe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 
diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..dabd7bee7f39338d59702bbb5729402e4c46ee0f
--- /dev/null
+++ b/tests/models/language/pooling/embed_utils.py
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Sequence
+from typing import Optional
+
+import pytest
+
+from tests.conftest import HfRunner
+from tests.models.utils import (EmbedModelInfo, check_embeddings_close,
+                                matryoshka_fy)
+
+
+def run_embedding_correctness_test(
+    hf_model: "HfRunner",
+    inputs: list[str],
+    vllm_outputs: Sequence[list[float]],
+    dimensions: Optional[int] = None,
+):
+    hf_outputs = hf_model.encode(inputs)
+    if dimensions:
+        hf_outputs = matryoshka_fy(hf_outputs, dimensions)
+
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+        tol=1e-2,
+    )
+
+
+def correctness_test_embed_models(hf_runner,
+                                  vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts,
+                                  vllm_extra_kwargs=None,
+                                  hf_model_callback=None):
+    if not model_info.enable_test:
+        # A model family has many models with the same architecture,
+        # and we don't need to test each one.
+        pytest.skip("Skipping test.")
+
+    # The example_prompts has ending "\n", for example:
+    # "Write a short story about a robot that dreams for the first time.\n"
+    # sentence_transformers will strip the input texts, see:
+    # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
+    # This makes the input_ids different between hf_model and vllm_model.
+    # So we need to strip the input texts to avoid test failing.
+    example_prompts = [str(s).strip() for s in example_prompts]
+
+    vllm_extra_kwargs = vllm_extra_kwargs or {}
+    vllm_extra_kwargs["dtype"] = model_info.dtype
+
+    with vllm_runner(model_info.name,
+                     task="embed",
+                     max_model_len=None,
+                     **vllm_extra_kwargs) as vllm_model:
+        vllm_outputs = vllm_model.encode(example_prompts)
+
+    with hf_runner(
+            model_info.name,
+            dtype="float32",
+            is_sentence_transformer=True,
+    ) as hf_model:
+
+        if hf_model_callback is not None:
+            hf_model_callback(hf_model)
+
+        run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py
index f83c9940d5243d7a3a42c2f70ea41852192ae42e..0a047951db443a1d942ae6982b768b4d206a0f92 100644
--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Sequence
 
 import mteb
@@ -6,7 +7,6 @@ import numpy as np
 import pytest
 
 from tests.models.utils import EmbedModelInfo
-from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 
 # Most models on the STS12 task (See #17175):
 # - Model implementation and minor changes in tensor dtype
@@ -80,18 +80,19 @@ def run_mteb_embed_task_st(model_name, tasks):
 def mteb_test_embed_models(hf_runner,
                            vllm_runner,
                            model_info: EmbedModelInfo,
-                           vllm_extra_kwargs=None):
+                           vllm_extra_kwargs=None,
+                           hf_model_callback=None):
     if not model_info.enable_test:
         # A model family has many models with the same architecture,
         # and we don't need to test each one.
         pytest.skip("Skipping test.")
 
     vllm_extra_kwargs = vllm_extra_kwargs or {}
+    vllm_extra_kwargs["dtype"] = model_info.dtype
 
     with vllm_runner(model_info.name,
                      task="embed",
                      max_model_len=None,
-                     dtype=model_info.dtype,
                      **vllm_extra_kwargs) as vllm_model:
 
         if model_info.architecture:
@@ -101,17 +102,19 @@ def mteb_test_embed_models(hf_runner,
         vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                               MTEB_EMBED_TASKS)
         vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
-        model_dtype = getattr(
-            vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype",
-            vllm_dtype)
 
-    with set_default_torch_dtype(model_dtype) and hf_runner(
-            model_info.name, is_sentence_transformer=True,
-            dtype=model_dtype) as hf_model:
+    with hf_runner(model_info.name,
+                   is_sentence_transformer=True,
+                   dtype="float32") as hf_model:
+
+        if hf_model_callback is not None:
+            hf_model_callback(hf_model)
+
         st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
+        st_dtype = next(hf_model.model.parameters()).dtype
 
     print("VLLM:", vllm_dtype, vllm_main_score)
-    print("SentenceTransformer:", model_dtype, st_main_score)
+    print("SentenceTransformers:", st_dtype, st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, rel=MTEB_EMBED_TOL)
+    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py
new file mode 100644
index 0000000000000000000000000000000000000000..1af3c05d3d9075c91391545e0e73278b7121bc50
--- /dev/null
+++ b/tests/models/language/pooling/test_baai.py
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from .embed_utils import EmbedModelInfo, correctness_test_embed_models
+from .mteb_utils import mteb_test_embed_models
+
+MODELS = [
+    ########## BertModel
+    EmbedModelInfo("BAAI/bge-base-en",
+                   architecture="BertModel",
+                   enable_test=True),
+    EmbedModelInfo("BAAI/bge-base-zh",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-en",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-zh",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-en",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-zh",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-zh-noinstruct",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-base-en-v1.5",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-base-zh-v1.5",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-en-v1.5",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-zh-v1.5",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-en-v1.5",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-zh-v1.5",
+                   architecture="BertModel",
+                   enable_test=False),
+    ########## XLMRobertaModel
+    EmbedModelInfo("BAAI/bge-m3",
+                   architecture="XLMRobertaModel",
+                   enable_test=True),
+    ########## Qwen2Model
+    EmbedModelInfo("BAAI/bge-code-v1",
+                   architecture="Qwen2Model",
+                   dtype="float32",
+                   enable_test=True),
+]
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
+    mteb_test_embed_models(hf_runner, vllm_runner, model_info)
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_embed_models_correctness(hf_runner, vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
+                                  example_prompts)
diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py
index 44af3df08a8673370006208e6c67119c7be4a1fe..4a6d781ce6f095f83a012ff8599411508295199e 100644
--- a/tests/models/language/pooling/test_classification.py
+++ b/tests/models/language/pooling/test_classification.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 from transformers import AutoModelForSequenceClassification
@@ -43,6 +44,6 @@ def test_models(
 
         # the tolerance value of 1e-2 is selected based on the
         # half datatype tests in
-        # tests/models/embedding/language/test_embedding.py
+        # tests/models/language/pooling/test_embedding.py
         assert torch.allclose(hf_output, vllm_output,
                               1e-3 if dtype == "float" else 1e-2)
diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py
index a44b2154b13762fa1aef7acb012d103a5d7126ac..9516a01421cbb1ceef31e82f14340ebccbf180ef 100644
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 from vllm.config import PoolerConfig
@@ -10,29 +11,31 @@ from ...utils import check_embeddings_close
 @pytest.mark.parametrize(
     "model",
     [
-        # [Encoder-only]
-        pytest.param("BAAI/bge-base-en-v1.5",
-                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
-        pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
-        pytest.param("intfloat/multilingual-e5-small"),
-        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
+        # Be careful of the order of models, decoder-only models should be
+        # placed before encoder-only models, otherwise `Qwen2.5-0.5B-Instruct`
+        # case won't pass because gte-Qwen2-1.5B-instruct will cache custom
+        # model code with bidirectional attention.
         # [Decoder-only]
         pytest.param("BAAI/bge-multilingual-gemma2",
                      marks=[pytest.mark.core_model]),
         pytest.param("intfloat/e5-mistral-7b-instruct",
                      marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
         pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
+        # [Encoder-only]
+        pytest.param("BAAI/bge-base-en-v1.5",
+                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+        pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
+        pytest.param("intfloat/multilingual-e5-small"),
+        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
         # [Cross-Encoder]
         pytest.param("sentence-transformers/stsb-roberta-base-v2"),
     ],
 )
-@pytest.mark.parametrize("dtype", ["half"])
 def test_models(
     hf_runner,
     vllm_runner,
     example_prompts,
     model,
-    dtype: str,
     monkeypatch,
 ) -> None:
 
@@ -44,7 +47,7 @@ def test_models(
     vllm_extra_kwargs = {}
     if model == "ssmits/Qwen2-7B-Instruct-embed-base":
         vllm_extra_kwargs["override_pooler_config"] = \
-            PoolerConfig(pooling_type="MEAN")
+            PoolerConfig(pooling_type="MEAN", normalize=False)
 
     # The example_prompts has ending "\n", for example:
     # "Write a short story about a robot that dreams for the first time.\n"
@@ -54,13 +57,11 @@ def test_models(
     # So we need to strip the input texts to avoid test failing.
     example_prompts = [str(s).strip() for s in example_prompts]
 
-    with hf_runner(model, dtype=dtype,
-                   is_sentence_transformer=True) as hf_model:
+    with hf_runner(model, is_sentence_transformer=True) as hf_model:
         hf_outputs = hf_model.encode(example_prompts)
 
     with vllm_runner(model,
                      task="embed",
-                     dtype=dtype,
                      max_model_len=None,
                      **vllm_extra_kwargs) as vllm_model:
         vllm_outputs = vllm_model.encode(example_prompts)
diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py
index f450edd821623426008724f195c6b71ebce8516c..c2f70bb647a4e82cc94ae3d8d49790a3dc3a80a3 100644
--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 import importlib.util
diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py
index 91d10f529cd684da87125035297bb1ff2e177e34..05bd479f42b95abe98519a9c852f00cec9cd7b05 100644
--- a/tests/models/language/pooling/test_gte.py
+++ b/tests/models/language/pooling/test_gte.py
@@ -1,35 +1,31 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
 import pytest
 
-from ...utils import EmbedModelInfo, run_embedding_correctness_test
+from .embed_utils import EmbedModelInfo, correctness_test_embed_models
+from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     ########## BertModel
     EmbedModelInfo("thenlper/gte-large",
                    architecture="BertModel",
-                   dtype="float32",
                    enable_test=True),
     EmbedModelInfo("thenlper/gte-base",
                    architecture="BertModel",
-                   dtype="float32",
                    enable_test=False),
     EmbedModelInfo("thenlper/gte-small",
                    architecture="BertModel",
-                   dtype="float32",
                    enable_test=False),
     EmbedModelInfo("thenlper/gte-large-zh",
                    architecture="BertModel",
-                   dtype="float32",
                    enable_test=False),
     EmbedModelInfo("thenlper/gte-base-zh",
                    architecture="BertModel",
-                   dtype="float32",
                    enable_test=False),
     EmbedModelInfo("thenlper/gte-small-zh",
                    architecture="BertModel",
-                   dtype="float32",
                    enable_test=False),
     ########### NewModel
     EmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
@@ -53,9 +49,8 @@ MODELS = [
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_models_mteb(hf_runner, vllm_runner,
-                     model_info: EmbedModelInfo) -> None:
-    from .mteb_utils import mteb_test_embed_models
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
 
     vllm_extra_kwargs: dict[str, Any] = {}
     if model_info.architecture == "GteNewModel":
@@ -66,28 +61,13 @@ def test_models_mteb(hf_runner, vllm_runner,
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo,
-                            example_prompts) -> None:
-    if not model_info.enable_test:
-        pytest.skip("Skipping test.")
-
-    # ST will strip the input texts, see test_embedding.py
-    example_prompts = [str(s).strip() for s in example_prompts]
+def test_embed_models_correctness(hf_runner, vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts) -> None:
 
     vllm_extra_kwargs: dict[str, Any] = {}
     if model_info.architecture == "GteNewModel":
         vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
 
-    with vllm_runner(model_info.name,
-                     task="embed",
-                     dtype=model_info.dtype,
-                     max_model_len=None,
-                     **vllm_extra_kwargs) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
-
-    with hf_runner(
-            model_info.name,
-            dtype=model_info.dtype,
-            is_sentence_transformer=True,
-    ) as hf_model:
-        run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
+                                  example_prompts, vllm_extra_kwargs)
diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling/test_intfloat.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6e83857fa70ed4cb39934ff3e31320632a79d06
--- /dev/null
+++ b/tests/models/language/pooling/test_intfloat.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+from ...utils import EmbedModelInfo
+from .embed_utils import correctness_test_embed_models
+from .mteb_utils import mteb_test_embed_models
+
+MODELS = [
+    ########## BertModel
+    EmbedModelInfo("intfloat/e5-small",
+                   architecture="BertModel",
+                   enable_test=True),
+    EmbedModelInfo("intfloat/e5-base",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("intfloat/e5-large",
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("intfloat/multilingual-e5-small",
+                   architecture="BertModel",
+                   enable_test=False),
+    ########## XLMRobertaModel
+    EmbedModelInfo("intfloat/multilingual-e5-base",
+                   architecture="XLMRobertaModel",
+                   enable_test=True),
+    EmbedModelInfo("intfloat/multilingual-e5-large",
+                   architecture="XLMRobertaModel",
+                   enable_test=False),
+    EmbedModelInfo("intfloat/multilingual-e5-large-instruct",
+                   architecture="XLMRobertaModel",
+                   enable_test=False),
+]
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
+    mteb_test_embed_models(hf_runner, vllm_runner, model_info)
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_embed_models_correctness(hf_runner, vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
+                                  example_prompts)
diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py
index 0ddff2146caa7fafd297e8c63c0048ce75a5e242..33255021ad6acd9f52d3e5a8abd79e4c072d2c1d 100644
--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
@@ -1,9 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from functools import partial
+
 import pytest
 
 from vllm import PoolingParams
 
-from ...utils import check_embeddings_close, matryoshka_fy
+from .embed_utils import (EmbedModelInfo, check_embeddings_close,
+                          correctness_test_embed_models, matryoshka_fy)
+from .mteb_utils import mteb_test_embed_models
 
 SCORING_MODELS = [
     "jinaai/jina-reranker-v2-base-multilingual",  # Roberta
@@ -25,16 +30,9 @@ TEXTS_2 = [
 ]
 
 EMBEDDING_MODELS = [
-    "jinaai/jina-embeddings-v3",
-]
-
-EMBEDDING_PROMPTS = [
-    "Follow the white rabbit.",  # English
-    "Sigue al conejo blanco.",  # Spanish
-    "Suis le lapin blanc.",  # French
-    "跟着白兔走。",  # Chinese
-    "اتبع الأرنب الأبيض.",  # Arabic
-    "Folge dem weißen Kaninchen.",  # German
+    EmbedModelInfo("jinaai/jina-embeddings-v3",
+                   architecture="XLMRobertaModel",
+                   is_matryoshka=True)
 ]
 
 
@@ -80,73 +78,66 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
     assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
 
 
-@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
-def emb_model_name(request):
-    yield request.param
+@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
 
+    def hf_model_callback(model):
+        model.encode = partial(model.encode, task="text-matching")
 
-def test_is_matryoshka(vllm_runner, emb_model_name):
-    with vllm_runner(emb_model_name, task="embed",
-                     max_model_len=None) as vllm_model:
-        assert vllm_model.model.llm_engine.model_config.is_matryoshka
-
-
-@pytest.mark.parametrize("model", EMBEDDING_MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-def test_embeddings(
-    hf_runner,
-    vllm_runner,
-    model,
-    dtype: str,
-    monkeypatch,
-) -> None:
+    mteb_test_embed_models(hf_runner,
+                           vllm_runner,
+                           model_info,
+                           hf_model_callback=hf_model_callback)
 
-    example_prompts = EMBEDDING_PROMPTS
 
-    with hf_runner(
-            model,
-            dtype=dtype,
-            is_sentence_transformer=True,
-    ) as hf_model:
-        hf_outputs = hf_model.encode(example_prompts, task="text-matching")
+@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
+def test_embed_models_correctness(hf_runner, vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts) -> None:
 
-    with vllm_runner(model, task="embed", dtype=dtype,
-                     max_model_len=None) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+    def hf_model_callback(model):
+        model.encode = partial(model.encode, task="text-matching")
 
-    check_embeddings_close(
-        embeddings_0_lst=hf_outputs,
-        embeddings_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-        tol=1e-2,
-    )
+    correctness_test_embed_models(hf_runner,
+                                  vllm_runner,
+                                  model_info,
+                                  example_prompts,
+                                  hf_model_callback=hf_model_callback)
 
 
-@pytest.mark.parametrize("model", EMBEDDING_MODELS)
+@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("dimensions", [16, 32])
 def test_matryoshka(
     hf_runner,
     vllm_runner,
-    model,
+    model_info,
     dtype: str,
     dimensions: int,
+    example_prompts,
     monkeypatch,
 ) -> None:
+    if not model_info.is_matryoshka:
+        pytest.skip("Model is not matryoshka")
 
-    example_prompts = EMBEDDING_PROMPTS
+    # ST will strip the input texts, see test_embedding.py
+    example_prompts = [str(s).strip() for s in example_prompts]
 
     with hf_runner(
-            model,
+            model_info.name,
             dtype=dtype,
             is_sentence_transformer=True,
     ) as hf_model:
         hf_outputs = hf_model.encode(example_prompts, task="text-matching")
         hf_outputs = matryoshka_fy(hf_outputs, dimensions)
 
-    with vllm_runner(model, task="embed", dtype=dtype,
+    with vllm_runner(model_info.name,
+                     task="embed",
+                     dtype=dtype,
                      max_model_len=None) as vllm_model:
+        assert vllm_model.model.llm_engine.model_config.is_matryoshka
+
         matryoshka_dimensions = (
             vllm_model.model.llm_engine.model_config.matryoshka_dimensions)
         assert matryoshka_dimensions is not None
diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py
index 28df32e0c2301f1c68f85929674ad51076b15228..e16ec239a33814378e74d09db5f3c34b5c5a3206 100644
--- a/tests/models/language/pooling/test_nomic.py
+++ b/tests/models/language/pooling/test_nomic.py
@@ -1,50 +1,36 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
-from ...utils import EmbedModelInfo, run_embedding_correctness_test
+from .embed_utils import EmbedModelInfo, correctness_test_embed_models
+from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     EmbedModelInfo("nomic-ai/nomic-embed-text-v1",
                    architecture="NomicBertModel",
-                   dtype="float32",
                    enable_test=True),
     EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5",
                    architecture="NomicBertModel",
-                   dtype="float32",
+                   enable_test=False),
+    EmbedModelInfo("nomic-ai/CodeRankEmbed",
+                   architecture="NomicBertModel",
                    enable_test=False),
     EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
                    architecture="NomicBertModel",
-                   dtype="float32",
                    enable_test=True)
 ]
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_models_mteb(hf_runner, vllm_runner,
-                     model_info: EmbedModelInfo) -> None:
-    from .mteb_utils import mteb_test_embed_models
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo,
-                            example_prompts) -> None:
-    if not model_info.enable_test:
-        pytest.skip("Skipping test.")
-
-    # ST will strip the input texts, see test_embedding.py
-    example_prompts = [str(s).strip() for s in example_prompts]
-
-    with vllm_runner(model_info.name,
-                     task="embed",
-                     dtype=model_info.dtype,
-                     max_model_len=None) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
-
-    with hf_runner(
-            model_info.name,
-            dtype=model_info.dtype,
-            is_sentence_transformer=True,
-    ) as hf_model:
-        run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
+def test_embed_models_correctness(hf_runner, vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
+                                  example_prompts)
diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py
new file mode 100644
index 0000000000000000000000000000000000000000..250b3a52835aff15d6ed98c9d77720438df0796f
--- /dev/null
+++ b/tests/models/language/pooling/test_nomic_max_model_len.py
@@ -0,0 +1,131 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: SIM117
+import pytest
+
+from ...utils import EmbedModelInfo
+
+MODELS = [
+    EmbedModelInfo("nomic-ai/nomic-embed-text-v1"),
+    #EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
+    #EmbedModelInfo("nomic-ai/CodeRankEmbed"),
+    EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"),
+    #EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
+]
+
+rope_theta = 1000
+factor = 4.0
+original_max_position_embeddings = 2048
+max_model_len = int(original_max_position_embeddings * factor)
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_default(model_info, vllm_runner):
+    with vllm_runner(model_info.name, task="embed",
+                     max_model_len=None) as vllm_model:
+        model_config = vllm_model.model.llm_engine.model_config
+        if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
+            # For nomic-embed-text-v2-moe the length is set to 512
+            # by sentence_bert_config.json.
+            assert model_config.max_model_len == 512
+        else:
+            assert (
+                model_config.max_model_len == original_max_position_embeddings)
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_set_max_model_len_legal(model_info, vllm_runner):
+    # set max_model_len <= 512
+    with vllm_runner(model_info.name, task="embed",
+                     max_model_len=256) as vllm_model:
+        model_config = vllm_model.model.llm_engine.model_config
+        assert model_config.max_model_len == 256
+
+    # set 512 < max_model_len <= 2048
+    if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
+        # For nomic-embed-text-v2-moe the length is set to 512
+        # by sentence_bert_config.json.
+        with pytest.raises(ValueError):
+            with vllm_runner(model_info.name, task="embed",
+                             max_model_len=1024):
+                pass
+    else:
+        with vllm_runner(model_info.name, task="embed",
+                         max_model_len=1024) as vllm_model:
+            model_config = vllm_model.model.llm_engine.model_config
+            assert model_config.max_model_len == 1024
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_set_max_model_len_illegal(model_info, vllm_runner):
+    # set max_model_len > 2048
+    with pytest.raises(ValueError):
+        with vllm_runner(model_info.name, task="embed", max_model_len=4096):
+            pass
+
+    # set max_model_len > 2048 by hf_overrides
+    hf_overrides = {"max_model_len": 4096}
+    with pytest.raises(ValueError):
+        with vllm_runner(model_info.name,
+                         task="embed",
+                         max_model_len=None,
+                         hf_overrides=hf_overrides):
+            pass
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_use_rope_scaling_legal(model_info, vllm_runner):
+    hf_overrides = {
+        "rope_theta": rope_theta,
+        "rope_scaling": {
+            "rope_type": "yarn",
+            "factor": factor,
+            "original_max_position_embeddings":
+            original_max_position_embeddings
+        },
+        "max_model_len": max_model_len
+    }
+
+    with vllm_runner(model_info.name,
+                     task="embed",
+                     max_model_len=None,
+                     hf_overrides=hf_overrides):
+        pass
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_use_rope_scaling_illegal(model_info, vllm_runner):
+    hf_overrides = {
+        "rope_theta": rope_theta,
+        "rope_scaling": {
+            "rope_type": "yarn",
+            "factor": factor,
+            "original_max_position_embeddings":
+            original_max_position_embeddings
+        }
+    }
+    # illegal max_model_len
+    with pytest.raises(ValueError):
+        with vllm_runner(model_info.name,
+                         task="embed",
+                         max_model_len=max_model_len + 1,
+                         hf_overrides=hf_overrides):
+            pass
+
+    hf_overrides = {
+        "rope_theta": rope_theta,
+        "rope_scaling": {
+            "rope_type": "yarn",
+            "factor": factor,
+            "original_max_position_embeddings":
+            original_max_position_embeddings
+        },
+        "max_model_len": max_model_len + 1
+    }
+    # illegal max_model_len by hf_overrides
+    with pytest.raises(ValueError):
+        with vllm_runner(model_info.name,
+                         task="embed",
+                         max_model_len=None,
+                         hf_overrides=hf_overrides):
+            pass
diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py
index 6b10aeffc4b72c69a2a52d9aefc51eed01d9b2d8..c75ff14456169a6f5951232eca902aa7ea2c8dd3 100644
--- a/tests/models/language/pooling/test_scoring.py
+++ b/tests/models/language/pooling/test_scoring.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 import torch.nn.functional as F
diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py
index 5679e0e1ce00bc3bd18ac74482fd75adaaf9f020..d6b5dbd08372ef44b04afca0841e305bed88653b 100644
--- a/tests/models/language/pooling/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py
@@ -1,8 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
-from ...utils import EmbedModelInfo, run_embedding_correctness_test
+from .embed_utils import EmbedModelInfo, correctness_test_embed_models
+from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
@@ -41,37 +43,14 @@ MODELS = [
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_models_mteb(
-    hf_runner,
-    vllm_runner,
-    model_info: EmbedModelInfo,
-) -> None:
-    from .mteb_utils import mteb_test_embed_models
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
-def test_models_correctness(
-    hf_runner,
-    vllm_runner,
-    model_info: EmbedModelInfo,
-    example_prompts,
-) -> None:
-    if not model_info.enable_test:
-        pytest.skip("Skipping test.")
-
-    # ST will strip the input texts, see test_embedding.py
-    example_prompts = [str(s).strip() for s in example_prompts]
-
-    with vllm_runner(model_info.name,
-                     task="embed",
-                     dtype=model_info.dtype,
-                     max_model_len=None) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
-
-    with hf_runner(
-            model_info.name,
-            dtype=model_info.dtype,
-            is_sentence_transformer=True,
-    ) as hf_model:
-        run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
+def test_embed_models_correctness(hf_runner, vllm_runner,
+                                  model_info: EmbedModelInfo,
+                                  example_prompts) -> None:
+    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
+                                  example_prompts)
diff --git a/tests/models/language/pooling/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py
index 1b8ac395ed179c61aab296268ff534671a2d4cc3..33aff1c873fc46278d3ceadfd23529a1cdcbf8dd 100644
--- a/tests/models/language/pooling/test_truncation_control.py
+++ b/tests/models/language/pooling/test_truncation_control.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index 077632117b55c4097fc0960be0329b16e519c204..034afce5167cfacd4f906cf1bb62cda5bbc40b3e 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Common tests for testing .generate() functionality for single / multiple
 image, embedding, and video support for different VLMs in vLLM.
 """
@@ -225,6 +226,8 @@ VLM_TEST_SETTINGS = {
         img_idx_to_prompt=lambda idx: "",
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
+        # FIXME: https://github.com/huggingface/transformers/pull/38510
+        marks=[pytest.mark.skip("Model is broken")],
     ),
     "chameleon": VLMTestInfo(
         models=["facebook/chameleon-7b"],
@@ -280,10 +283,10 @@ VLM_TEST_SETTINGS = {
         multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        dtype="bfloat16",
         auto_cls=AutoModelForImageTextToText,
         vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
         patch_hf_runner=model_utils.gemma3_patch_hf_runner,
+        num_logprobs=10,
     ),
     "glm4v": VLMTestInfo(
         models=["THUDM/glm-4v-9b"],
@@ -336,7 +339,8 @@ VLM_TEST_SETTINGS = {
         models=[
             "OpenGVLab/InternVL2-1B",
             "OpenGVLab/InternVL2-2B",
-            "OpenGVLab/Mono-InternVL-2B",
+            # FIXME: Config cannot be loaded in transformers 4.52
+            # "OpenGVLab/Mono-InternVL-2B",
         ],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
@@ -567,6 +571,8 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
         prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
+        # FIXME: https://github.com/huggingface/transformers/issues/38358
+        marks=[pytest.mark.skip("Model initialization fails")],
     ),
     "qwen2_vl": VLMTestInfo(
         models=["Qwen/Qwen2-VL-2B-Instruct"],
diff --git a/tests/models/multimodal/generation/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py
index b8225f5f12437942234a1947edd0bac0f1bc1656..a622957f96f691b411d5a4042394e9579e3fa1df 100644
--- a/tests/models/multimodal/generation/test_florence2.py
+++ b/tests/models/multimodal/generation/test_florence2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
@@ -99,6 +100,8 @@ def run_test(
         )
 
 
+# FIXME: https://github.com/huggingface/transformers/issues/38358
+@pytest.mark.skip("Model initialization fails")
 @pytest.mark.core_model
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize(
diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py
index 96c444441e3d2c02ab5abf27b22a930b516199e8..c5ffa5f3a70afb243de7f92ee938f98ee21ce7c5 100644
--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from typing import Optional
@@ -28,7 +29,7 @@ def vllm_to_hf_output(
     return output_ids, hf_output_str, out_logprobs
 
 
-MODEL_NAME = "ibm-granite/granite-speech-3.3-8b"
+MODEL_NAME = "ibm-granite/granite-speech-3.3-2b"
 # Audio lora co-exists directly in the model directory, but
 # currently still needs to be passed directly to vLLM.
 audio_lora_path = MODEL_NAME
diff --git a/tests/models/multimodal/generation/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py
index 972db40e8bd61a40b1e9f5a44bd5a32ee0e212b4..949c0a80d31bcc65d276aecbdeea23b4f8e2d16e 100644
--- a/tests/models/multimodal/generation/test_interleaved.py
+++ b/tests/models/multimodal/generation/test_interleaved.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/models/multimodal/generation/test_mllama.py b/tests/models/multimodal/generation/test_mllama.py
index 99aa3c2d3bd993a5cbfd6f49922c11c38b08d73a..2bb01e494d43611260600dc789def6254c79d11a 100644
--- a/tests/models/multimodal/generation/test_mllama.py
+++ b/tests/models/multimodal/generation/test_mllama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, overload
 
diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py
index e51dbee479c5541e5a3a282df6fb0b29285f7527..4e8465778e25676eb84088b0b95d655ec38ac4d6 100644
--- a/tests/models/multimodal/generation/test_phi4mm.py
+++ b/tests/models/multimodal/generation/test_phi4mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from collections.abc import Sequence
@@ -121,6 +122,10 @@ def run_test(
             for prompts, images, audios in inputs
         ]
 
+    # This error occurs inside `get_peft_model`
+    # FIXME: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/75
+    pytest.skip("HF impl is not compatible with current transformers")
+
     hf_model_kwargs = {"_attn_implementation": "sdpa"}
     with hf_runner(model, dtype=dtype,
                    model_kwargs=hf_model_kwargs) as hf_model:
diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py
index 506b71472f4a8d1c45f54d82a6b3438dc72d2231..1def825ab0874daa426d08f1ba628927d0205856 100644
--- a/tests/models/multimodal/generation/test_pixtral.py
+++ b/tests/models/multimodal/generation/test_pixtral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 from dataclasses import asdict
 from typing import TYPE_CHECKING, Any, Optional
diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py
index 6be401b775ec281dab8f3da967671ca60329d575..a2793b8c8ddf7e50e96fa17a44eef75dc56cdfdc 100644
--- a/tests/models/multimodal/generation/test_qwen2_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional, TypedDict, Union
 
diff --git a/tests/models/multimodal/generation/test_ultravox.py b/tests/models/multimodal/generation/test_ultravox.py
index 2c8a06688ca02525a1ecdc03560e08ea5c7c7bea..e7e7bd3154a113651688442298530224f349ba69 100644
--- a/tests/models/multimodal/generation/test_ultravox.py
+++ b/tests/models/multimodal/generation/test_ultravox.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from typing import Any
diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py
index 4e48bdbd04289c9c9c90cd4ef61ea1c2f3a5c9f6..363d55153aac60050784e0eea705ff57018c545d 100644
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import pytest
@@ -100,6 +101,7 @@ def run_test(
 
     with vllm_runner(
             model,
+            dtype="half",
             max_model_len=448,
             tensor_parallel_size=tensor_parallel_size,
             distributed_executor_backend=distributed_executor_backend,
diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py
index 32117c8d8dca0313ae286db88101d0cc175c0565..7d20dd66089bbb90c437f111a0b86220cccbb464 100644
--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Helpers for building inputs that can be leveraged for different test types.
 """
 from collections.abc import Iterable
diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
index a5077a090b5231a8dc17aca49f68630627d7c552..336e2dd2b1201b4e58dff1173546ea65835da012 100644
--- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py
+++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utils for determining which subset of model tests belong to a specific
 modality, getting all combinations (similar to pytest's parametrization),
 handling multimodal placeholder substitution, and so on.
diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py
index ccd2799abd90c929476fb107e7b64552fba6428b..8c83d8f8a8a22bf020d8b7d49431fdb470c3d702 100644
--- a/tests/models/multimodal/generation/vlm_utils/core.py
+++ b/tests/models/multimodal/generation/vlm_utils/core.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Core test implementation to be shared across modalities."""
 from typing import Any, Callable, Optional
 
diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
index cc10455611386b0e706f23fdafdcad18a201134d..aa5835243e0428e068cd96237ea1a581bc3ffadf 100644
--- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
+++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom input builders for edge-cases in different models."""
 from io import BytesIO
 from typing import Callable
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index dc1ea5208240d2d307454353c737a47025dc045a..af4c72f44b6761819214cb2f3f199cca1d20f5b9 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Common utility functions relating to different models that are useful
 for manipulating the input / output of HF & vLLM test runners, which are
 typically specific to a small subset of models.
@@ -9,11 +10,12 @@ from typing import Optional, Union
 
 import numpy as np
 import numpy.typing as npt
+import pytest
 import regex as re
 import torch
 from PIL.Image import Image
 from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
-                          GenerationConfig)
+                          GenerationConfig, GenerationMixin)
 
 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import patch_padding_side
@@ -323,6 +325,16 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
 
     hf_model.processor = processor
 
+    orig_generate = hf_model.model.generate
+
+    def _generate(self, *args, **kwargs):
+        # FIXME: https://github.com/huggingface/transformers/issues/38333
+        kwargs["disable_compile"] = True
+
+        return orig_generate(*args, **kwargs)
+
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
+
     return hf_model
 
 
@@ -609,6 +621,11 @@ def _internvl_generate(
     if getattr(self, "use_visual_token_mask", False):
         visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
         forward_kwargs["visual_token_mask"] = visual_token_mask
+
+    # e.g. InternVL2-2B
+    if not isinstance(self.language_model, GenerationMixin):
+        pytest.skip("HF impl is not compatible with current transformers")
+
     outputs = self.language_model.generate(
         **forward_kwargs,
         **generate_kwargs,
diff --git a/tests/models/multimodal/generation/vlm_utils/runners.py b/tests/models/multimodal/generation/vlm_utils/runners.py
index 9e8a1262e8c1c1c8d5b3cb637f6febb1ee5d6b1b..562f89df13470c05e535f2f589be3b8382dd5f73 100644
--- a/tests/models/multimodal/generation/vlm_utils/runners.py
+++ b/tests/models/multimodal/generation/vlm_utils/runners.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Entrypoints for wrapping the core run_test implementation for specific test
 types / modalities.
 """
diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py
index 1c2bb4d6222b43215b0d961fd0649c4adfa7c055..0ec7909e744d7e55fa2fe3397ca9b04a9921b624 100644
--- a/tests/models/multimodal/generation/vlm_utils/types.py
+++ b/tests/models/multimodal/generation/vlm_utils/types.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Types for writing multimodal model tests."""
 from collections.abc import Iterable
 from enum import Enum
diff --git a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
index ea1caec0ecf34330dc763f1f9c4ddeb4f118fe8c..3734d87b7962ef0af34f69e4d08350d36e7a24e7 100644
--- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
+++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable
 
diff --git a/tests/models/multimodal/pooling/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py
index 76f9fbe0255056db8f1e981a81412d422ee73e7c..3e2be34a50ad5ac72bf04d7e2118468ebf8f6c7d 100644
--- a/tests/models/multimodal/pooling/test_intern_vit.py
+++ b/tests/models/multimodal/pooling/test_intern_vit.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 import torch.nn as nn
diff --git a/tests/models/multimodal/pooling/test_llava_next.py b/tests/models/multimodal/pooling/test_llava_next.py
index 77508738cc8700e5e1b87f08f6114ce2ca46f111..b6d90d2b0abed11b97b4c9f8468105a368f0083c 100644
--- a/tests/models/multimodal/pooling/test_llava_next.py
+++ b/tests/models/multimodal/pooling/test_llava_next.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch.nn.functional as F
diff --git a/tests/models/multimodal/pooling/test_phi3v.py b/tests/models/multimodal/pooling/test_phi3v.py
index cd58a5cb4531c3316a4d5c9d6c0fdb9993e828ce..b42ac6fb21eddb6fa1425d8682b5aa930d856260 100644
--- a/tests/models/multimodal/pooling/test_phi3v.py
+++ b/tests/models/multimodal/pooling/test_phi3v.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch.nn.functional as F
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 1895acd8968ddc82df6d4c80538533347b5b709e..1e6608955b31bde1a14700cfec02977c5db007f6 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import partial
 from typing import Optional, Union
@@ -40,7 +41,7 @@ def _test_processing_correctness(
         tokenizer_mode=model_info.tokenizer_mode,
         trust_remote_code=model_info.trust_remote_code,
         seed=0,
-        dtype="float16",
+        dtype="auto",
         revision=None,
         hf_overrides=model_info.hf_overrides,
     )
@@ -244,7 +245,7 @@ def _test_processing_correctness_one(
     "adept/fuyu-8b",
     "google/gemma-3-4b-it",
     "THUDM/glm-4v-9b",
-    "ibm-granite/granite-speech-3.3-8b",
+    "ibm-granite/granite-speech-3.3-2b",
     "h2oai/h2ovl-mississippi-800m",
     "OpenGVLab/InternVL2-1B",
     "OpenGVLab/InternVL3-1B",
@@ -282,7 +283,7 @@ def _test_processing_correctness_one(
     "Skywork/Skywork-R1V-38B",
     "fixie-ai/ultravox-v0_5-llama-3_2-1b",
     "openai/whisper-large-v3",
-    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "omni-research/Tarsier-7b",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py
index 37142b6dd36f16f6c5052734aed2bfa5c1bd6aba..76e4acc67d4d5f14275388e10b4b47a705997177 100644
--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for H2OVL's multimodal preprocessing kwargs."""
 from collections.abc import Mapping
 from typing import Optional
diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py
index c35ce2f6ab291b927666a5a63346aea01d006cf0..d3a55993e5588baf172ebb1179f5ca4e343b9bcb 100644
--- a/tests/models/multimodal/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for Idefics3's multimodal preprocessing kwargs."""
 import pytest
 from transformers import Idefics3Config
diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py
index 7ec81197a3db62cf6693c11fa1faac2ef0f22859..c3e2841a8f06038a4ba6125550b5b3f52f6fe25a 100644
--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for InternVL's multimodal preprocessing kwargs."""
 from collections.abc import Mapping
 from typing import Optional
diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py
index 614f17dbbeda7a68fb839d23b693460077dd9777..9ef7af556291e5da52aebfdd2faf7cc4a8e8a691 100644
--- a/tests/models/multimodal/processing/test_llama4.py
+++ b/tests/models/multimodal/processing/test_llama4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for Llama4's multimodal preprocessing kwargs."""
 
 import pytest
diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index b82bfe483dbbc5c9b2c7b2b335b8b20f68becb0f..ca34d1d758a46920de1020d9dc28803573b58913 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from functools import partial
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index dcc8dc8dab5a0725234128d798ac34867650a32e..e6344c4e7e6fd1916fcfc147c04ade60cebe1aa4 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from functools import partial
diff --git a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py
index 9bd2b9887294fe87d2bca3ea7a10f0feed6f795d..9387212e3f1017a97d6854e32691265e3e6087f8 100644
--- a/tests/models/multimodal/processing/test_minimax_vl_01.py
+++ b/tests/models/multimodal/processing/test_minimax_vl_01.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from PIL import Image
diff --git a/tests/models/multimodal/processing/test_mllama.py b/tests/models/multimodal/processing/test_mllama.py
index d4794396f6d20fc747851dc0fb94cf9a276d3452..a6b20a1e3678ef9e67cd246ab2ee32477b88547c 100644
--- a/tests/models/multimodal/processing/test_mllama.py
+++ b/tests/models/multimodal/processing/test_mllama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for mllama's multimodal preprocessing and profiling."""
 import pytest
 from transformers import MllamaConfig
diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py
index b53351544c458983227589ec20eb625f4bd2e83a..1f3646f7948685265a4bd2402afa90f7feca743b 100644
--- a/tests/models/multimodal/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for phi3v's multimodal preprocessing kwargs."""
 import pytest
 
diff --git a/tests/models/multimodal/processing/test_phi4mm.py b/tests/models/multimodal/processing/test_phi4mm.py
index c6e272650e08bceee8abe177056328f51221748c..f16d261c2c6a4551c827af4859d656016088f6a4 100644
--- a/tests/models/multimodal/processing/test_phi4mm.py
+++ b/tests/models/multimodal/processing/test_phi4mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for phi4mm's multimodal preprocessing kwargs."""
 import pytest
 
diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py
index 02abe1ca8b024167754cc252b2205207a472d152..9d1cd183387bc94c4891e3350fd027280a4d34a1 100644
--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py
index 224d1bcedb96654f449314fc3ca753f876dd871c..af8f983388c6cb89105bfce98d60856a52a593cd 100644
--- a/tests/models/multimodal/processing/test_smolvlm.py
+++ b/tests/models/multimodal/processing/test_smolvlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for smolvlm's multimodal preprocessing kwargs."""
 import pytest
 from transformers import SmolVLMConfig
diff --git a/tests/models/quantization/test_aqlm.py b/tests/models/quantization/test_aqlm.py
index 1272a62974cc8fdaaf9a5957d3b4f1485a427761..de6851e2fc282fbbec0f8c8ee02261f80f73274d 100644
--- a/tests/models/quantization/test_aqlm.py
+++ b/tests/models/quantization/test_aqlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 from tests.quantization.utils import is_quant_method_supported
diff --git a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py
index 597c8e48fb64dc36266bec4c1c46ea408854a4d2..bd696198931ff1b067098c98edf5a593593c4860 100644
--- a/tests/models/quantization/test_awq.py
+++ b/tests/models/quantization/test_awq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/models/quantization/test_bitblas.py b/tests/models/quantization/test_bitblas.py
index f0781394d81d1687e21fe10040d76eb2fbc29b16..754ac9a29a13254fc356f2bf67e3356672c3eff8 100644
--- a/tests/models/quantization/test_bitblas.py
+++ b/tests/models/quantization/test_bitblas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the outputs of a GPTQ model to a bitblas model.
 
 Note: GPTQ and bitblas do not have bitwise correctness.
diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py
index e01ee2026393534c19c3f11dc2afd40e320d068c..10914abf9ad3df7f5fac5c1ff85e637a957d19c4 100644
--- a/tests/models/quantization/test_fp8.py
+++ b/tests/models/quantization/test_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # flake8: noqa
 """Tests fp8 models against ground truth generation
diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py
index 5f17d12284a04184cc6550318ac75f0107883060..32f9472c12d5e5865355ef4aa65ce216dab1849b 100644
--- a/tests/models/quantization/test_gguf.py
+++ b/tests/models/quantization/test_gguf.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Tests gguf models against unquantized models generations
 Note: To pass the test, quantization higher than Q4 should be used
@@ -37,7 +38,7 @@ class GGUFTestConfig(NamedTuple):
 LLAMA_CONFIG = GGUFTestConfig(
     original_model="meta-llama/Llama-3.2-1B-Instruct",
     gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
-    gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
+    gguf_filename="Llama-3.2-1B-Instruct-Q6_K.gguf",
 )
 
 QWEN2_CONFIG = GGUFTestConfig(
diff --git a/tests/models/quantization/test_gptq_bitblas.py b/tests/models/quantization/test_gptq_bitblas.py
index c8e96455fd0c564b5033ea0e6841e71974e91b24..c3aed77525de9a7f490861a95da39ed2dc77ac51 100644
--- a/tests/models/quantization/test_gptq_bitblas.py
+++ b/tests/models/quantization/test_gptq_bitblas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the outputs of a GPTQ model to a bitblas model.
 
 Note: GPTQ and bitblas do not have bitwise correctness.
diff --git a/tests/models/quantization/test_gptq_marlin.py b/tests/models/quantization/test_gptq_marlin.py
index 397bdb98123f191c2230187ee9e16aa1ace6e1ba..db70a3bd2c0461d808723f8c8970444b1a4d10d4 100644
--- a/tests/models/quantization/test_gptq_marlin.py
+++ b/tests/models/quantization/test_gptq_marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compares the outputs of gptq vs gptq_marlin.
 
 Note: GPTQ and Marlin do not have bitwise correctness.
diff --git a/tests/models/quantization/test_gptq_marlin_24.py b/tests/models/quantization/test_gptq_marlin_24.py
index 6fb24b1f432e6b082af6407c16ed279dd9d18d12..9b86ae95ba5c7bae625238e70caf1986159f8389 100644
--- a/tests/models/quantization/test_gptq_marlin_24.py
+++ b/tests/models/quantization/test_gptq_marlin_24.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the outputs of a GPTQ model to a Marlin_24 model.
 
 Note: GPTQ and Marlin_24 do not have bitwise correctness.
diff --git a/tests/models/quantization/test_modelopt.py b/tests/models/quantization/test_modelopt.py
index 1d9aa4fa8adea9d4da6046df6bfbf2dc920123d2..6ad526cc893f3722ed24c82b203e39fc2e57bb2d 100644
--- a/tests/models/quantization/test_modelopt.py
+++ b/tests/models/quantization/test_modelopt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # flake8: noqa
 """Tests Model Optimizer fp8 models against ground truth generation
diff --git a/tests/models/quantization/test_mxfp4.py b/tests/models/quantization/test_mxfp4.py
index 9a060829525e17f799cba027d6930b6f8476d300..7b8a334bbc369f2f61090c488bb744db581d3169 100644
--- a/tests/models/quantization/test_mxfp4.py
+++ b/tests/models/quantization/test_mxfp4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # flake8: noqa
 """Tests Quark mxfp4 models against ground truth generation
 """
diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py
index 510858c2d7ef24833032648a60b13f5a0b907d20..b95dad9a4effed0632bf61161e186e9810db5f16 100644
--- a/tests/models/quantization/test_nvfp4.py
+++ b/tests/models/quantization/test_nvfp4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # flake8: noqa
 """Tests Model Optimizer nvfp4 models against ground truth generation
 Note: these tests will only pass on B200
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 4f74eb47000a8e79b336a3ccb87be7acff75efa5..3055b7a8f78ec77452fb05b9e697d10f1f554446 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Mapping, Set
 from dataclasses import dataclass, field
@@ -159,17 +160,12 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"),  # noqa: E501
     "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
     "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct",
-                                          is_available_online=False,
-                                          min_transformers_version="4.52.2"),
+                                          min_transformers_version="4.53"),
     "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
     "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
     "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"),
     "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"),
-    "Glm4ForCausalLM": _HfExamplesInfo(
-        "THUDM/GLM-4-32B-0414",
-        is_available_online=False,
-        min_transformers_version="4.52.dev0"
-    ),
+    "Glm4ForCausalLM": _HfExamplesInfo("THUDM/GLM-4-9B-0414"),
     "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2",
                                        {"alias": "gpt2"}),
     "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder",
@@ -180,8 +176,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                           {"1b": "EleutherAI/pythia-1.4b"}),
     "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
     "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
-    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview",  # noqa: E501
-                                                   min_transformers_version="4.52.0"),  # noqa: E501
+    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"),  # noqa: E501
     "GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"),  # noqa: E501
     "Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
                                              trust_remote_code=True),
@@ -202,8 +197,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
                                         is_available_online=False),
     "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
-    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1",
-                                         is_available_online=False),
+    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
     "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"),  # noqa: E501
     "MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16",
                                          trust_remote_code=True),
@@ -218,6 +212,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
     "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
     "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
+    "NemotronHForCausalLM": _HfExamplesInfo("nvidia/Nemotron-H-8B-Base-8K",
+                                            trust_remote_code=True),
     "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"),
     "Olmo2ForCausalLM": _HfExamplesInfo("allenai/OLMo-2-0425-1B"),
     "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"),
@@ -242,10 +238,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
     "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
     "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
-    "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b",
-                                     is_available_online=False),
+    "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
     "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b",  # noqa: E501
-                                                is_available_online=False),
+                                                v0_only=True),
     "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t",
                                            v0_only=True),
     "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
@@ -255,7 +250,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "TeleFLMForCausalLM": _HfExamplesInfo("CofeAI/FLM-2-52B-Instruct-2407",
                                             trust_remote_code=True),
     "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat",
-                                         is_available_online=False,
+                                         tokenizer="meta-llama/Llama-2-7b",
                                          trust_remote_code=True),
     "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
     "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
@@ -274,8 +269,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
                                                trust_remote_code=True),
     "GteNewModel": _HfExamplesInfo("Alibaba-NLP/gte-base-en-v1.5",
                                    trust_remote_code=True,
-                                   hf_overrides={"architectures":
-                                                     ["GteNewModel"]}),
+                                   hf_overrides={"architectures": ["GteNewModel"]}),  # noqa: E501
     "InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward",
                                                trust_remote_code=True),
     "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"),  # noqa: E501
@@ -283,7 +277,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
     "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
     "ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base",
                                 trust_remote_code=True),
-    "NomicBertModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-long",  # noqa: E501
+    "NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe",
                                                trust_remote_code=True),
     "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
     "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"),
@@ -297,10 +291,8 @@ _EMBEDDING_EXAMPLE_MODELS = {
     "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full",
                                          trust_remote_code=True),
     "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501
-    # The model on Huggingface is currently being updated,
-    # hence I temporarily mark it as not available online
-    "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",  # noqa: E501
-                                            is_available_online=False),
+    "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501
+                                            is_available_online=False),  # noqa: E501
 }
 
 _CROSS_ENCODER_EXAMPLE_MODELS = {
@@ -326,8 +318,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                 hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}),  # noqa: E501
     "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
     "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"),
-    "GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-8b",  # noqa: E501
-                                                             min_transformers_version="4.52.0"),  # noqa: E501
+    "GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-2b"),  # noqa: E501
     "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b",
                                         trust_remote_code=True,
                                         hf_overrides={"architectures": ["GLM4VForCausalLM"]}),  # noqa: E501
@@ -346,7 +337,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                       trust_remote_code=True,
                                                       v0_only=True),
     "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",   # noqa: E501
-                                                      min_transformers_version="4.51",
                                                       max_model_len=10240),
     "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
                                                      extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501
@@ -359,8 +349,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                       transformers_version_reason="HF model is not compatible.",  # noqa: E501
                                                       hf_overrides={"architectures": ["MantisForConditionalGeneration"]}),  # noqa: E501
     "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6",
-                                max_transformers_version="4.48",
-                                transformers_version_reason="Use of deprecated imports which have been removed.",  # noqa: E501
                                 trust_remote_code=True),
     "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
                                 extras={"2.6": "openbmb/MiniCPM-V-2_6"},  # noqa: E501
@@ -398,20 +386,20 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"),  # noqa: E501
     "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
     "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"),  # noqa: E501
-    "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B",
-                                        min_transformers_version="4.52"),
-    "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ",  # noqa: E501
-                                                           min_transformers_version="4.52"),
+    "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"),
+    "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"),  # noqa: E501
     "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
     "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"),  # noqa: E501
     "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
                                      trust_remote_code=True),
+    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b",  # noqa: E501
+                                                        hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}),  # noqa: E501
     # [Encoder-decoder]
     # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
     # Therefore, we borrow the BartTokenizer from the original Bart model
     "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base",  # noqa: E501
-                                                         tokenizer="Isotr0py/Florence-2-tokenizer",
-                                                         trust_remote_code=True,),  # noqa: E501
+                                                         tokenizer="Isotr0py/Florence-2-tokenizer",  # noqa: E501
+                                                         trust_remote_code=True),  # noqa: E501
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
     "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct"),  # noqa: E501
     "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
@@ -435,6 +423,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
                                             trust_remote_code=True,
                                             speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
                                             tokenizer="meta-llama/Llama-3.1-8B-Instruct"),
+    "EagleMiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-1B-sft-bf16",
+                                            trust_remote_code=True,
+                                            is_available_online=False,
+                                            speculative_model="openbmb/MiniCPM-2B-sft-bf16",
+                                            tokenizer="openbmb/MiniCPM-2B-sft-bf16"),
     "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
                                     trust_remote_code=True,
                                     speculative_model="XiaomiMiMo/MiMo-7B-RL")
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index d403cb392fe0601c27e4b5402bb2ac0bb6ef4fc0..54e8cd597bfc49a38f2a256b83e54e733ea2e6ea 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import patch
 
@@ -20,6 +21,10 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
+    # FIXME: Possible memory leak in the previous tests?
+    if model_arch == "GraniteSpeechForConditionalGeneration":
+        pytest.skip("Avoid OOM")
+
     # Avoid OOM and reduce initialization time by only using 1 layer
     def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
         hf_config.update(model_info.hf_overrides)
@@ -40,6 +45,13 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
                 "num_hidden_layers": 1,
             })
 
+        # e.g.: ibm-granite/granite-speech-3.3-2b
+        if hasattr(hf_config, "encoder_config"):
+            hf_config.encoder_config.update({
+                "num_layers": 1,
+                "num_hidden_layers": 1,
+            })
+
         return hf_config
 
     # Avoid calling model.forward()
@@ -74,6 +86,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
             } if model_info.speculative_model else None,
             trust_remote_code=model_info.trust_remote_code,
             max_model_len=model_info.max_model_len,
+            # these tests seem to produce leftover memory
+            gpu_memory_utilization=0.80,
             load_format="dummy",
             hf_overrides=hf_overrides,
         )
diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py
index b62720caa9cb51b0587213eab09154eb98eba33f..ef0ad613d5252c5611df16626286b9d5b6760495 100644
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py
index 4c5572a569ea0b21962931990dfc69cd905bda90..6b623fa0003552642e3b17eb1395c783c8acc14f 100644
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
 
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index 1a51b4aeab04d4f00ef2130675b0b3ca7ae37024..b7b99ce41cbb0cfbae534163a9af3414c2a60d98 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test the functionality of the Transformers backend."""
 from typing import Any, Optional, Union
 
diff --git a/tests/models/test_utils.py b/tests/models/test_utils.py
index a16384efe1956f277b5ebf28f27dfffb87d0f758..b52327a1844f6e95c4beb8dc80b2c6f786db4787 100644
--- a/tests/models/test_utils.py
+++ b/tests/models/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py
index d64c0e6d4e430a5d3dd9b309afc9d91500147dea..310d3a3719b65be1137c1619c5fe36900c21805f 100644
--- a/tests/models/test_vision.py
+++ b/tests/models/test_vision.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/models/utils.py b/tests/models/utils.py
index a43fd77c6d794c5f09d06d353a3e105b1b4f9574..943b4f570446819a6a69ae37f41edcdc49c2cb64 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
 from collections.abc import Sequence
-from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple, Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -13,9 +14,6 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
 
 from .registry import HF_EXAMPLE_MODELS
 
-if TYPE_CHECKING:
-    from ..conftest import HfRunner
-
 TokensText = tuple[list[int], str]
 
 
@@ -317,6 +315,7 @@ def check_embeddings_close(
                                   dim=0)
 
         fail_msg = (f"Test{prompt_idx}:"
+                    f"\nCosine similarity: \t{sim:.4f}"
                     f"\n{name_0}:\t{embeddings_0[:16]!r}"
                     f"\n{name_1}:\t{embeddings_1[:16]!r}")
 
@@ -337,22 +336,3 @@ class EmbedModelInfo(NamedTuple):
     architecture: str = ""
     dtype: str = "auto"
     enable_test: bool = True
-
-
-def run_embedding_correctness_test(
-    hf_model: "HfRunner",
-    inputs: list[str],
-    vllm_outputs: Sequence[list[float]],
-    dimensions: Optional[int] = None,
-):
-    hf_outputs = hf_model.encode(inputs)
-    if dimensions:
-        hf_outputs = matryoshka_fy(hf_outputs, dimensions)
-
-    check_embeddings_close(
-        embeddings_0_lst=hf_outputs,
-        embeddings_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-        tol=1e-2,
-    )
diff --git a/tests/mq_llm_engine/conftest.py b/tests/mq_llm_engine/conftest.py
index 1a20e2c135c2ef40fb0feda5ae4e11e873eb8ac9..375b248ebedaaed26ed19e90f78373d3d528fcc2 100644
--- a/tests/mq_llm_engine/conftest.py
+++ b/tests/mq_llm_engine/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/mq_llm_engine/test_abort.py b/tests/mq_llm_engine/test_abort.py
index 808346b5e58d53186691ab6286e1c6d108c19d27..5ff08cbb32487d4be795ad736662c8e05ba7ce99 100644
--- a/tests/mq_llm_engine/test_abort.py
+++ b/tests/mq_llm_engine/test_abort.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test that aborting is handled properly."""
 
 import asyncio
diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py
index e617bd057f1f45ce4fa91e4e16f84dbde122517a..49b02279d61bb4ca2b11f2cd48bcff7e6528282d 100644
--- a/tests/mq_llm_engine/test_error_handling.py
+++ b/tests/mq_llm_engine/test_error_handling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test that various errors are handled properly."""
 
 import asyncio
diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py
index 2069ff987f2fe1d488b59d6321f21b45d6110749..e9fd5b814f285a220e2ccf93c53b677dff361490 100644
--- a/tests/mq_llm_engine/test_load.py
+++ b/tests/mq_llm_engine/test_load.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test that the MQLLMEngine is able to handle 10k concurrent requests."""
 
 import asyncio
diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py
index 64559609abb2dd74de836a692f4ec842b7dc66aa..7976d5031aea1ad2fd266a98db92eec868bcf563 100644
--- a/tests/mq_llm_engine/utils.py
+++ b/tests/mq_llm_engine/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import multiprocessing
diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py
index ce716e6474cb49777528bb588539745d0ef0e98f..56e339d485c56337848bab3c8285c0ff6b41750f 100644
--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Test the AsyncLLMEngine with multi-step-decoding
 from typing import Optional
diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py
index a823e484beab685f2c367e7cf70e46f2185b0ce1..9f1b3bbe8e22623f095ca723f9f2018422a235f8 100644
--- a/tests/multi_step/test_correctness_llm.py
+++ b/tests/multi_step/test_correctness_llm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Test the LLMEngine with multi-step-decoding
 
diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py
index 17b36b36888d5b71124230c50383d6b128d64a08..b5048c8cc3ad87e60c890e7c1f1595796e790093 100644
--- a/tests/multimodal/test_hasher.py
+++ b/tests/multimodal/test_hasher.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 
 import numpy as np
diff --git a/tests/multimodal/test_image.py b/tests/multimodal/test_image.py
index 56b5475c9ca0454fed2a4d00c20043e90525e8fb..cfd44351a6d1f8deccd9b0301c62b96afecde64c 100644
--- a/tests/multimodal/test_image.py
+++ b/tests/multimodal/test_image.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 
 import numpy as np
diff --git a/tests/multimodal/test_inputs.py b/tests/multimodal/test_inputs.py
index f5d3e282f953d602a9b643776f2853b4b6918a35..ffb3a6fe86b46ca03741b896d9afe06a856840be 100644
--- a/tests/multimodal/test_inputs.py
+++ b/tests/multimodal/test_inputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 59f7bf8fab2fec40af5d44c298490d1900f2b2fa..8b52911c6ccf3ea029e90971ee14f408669d0e47 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import nullcontext
 from types import MethodType
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index f1e45da30eda4de039150ff399b235a3e9ff182d..5ac0a90f50473c5c3d1416ae304f365dede9f939 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 import mimetypes
@@ -8,12 +9,21 @@ from typing import TYPE_CHECKING, NamedTuple, Optional
 
 import numpy as np
 import pytest
+import torch
+import torch.multiprocessing as mp
 from PIL import Image, ImageChops
 
+from tests.utils import multi_gpu_test
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed.parallel_state import (init_distributed_environment,
+                                             initialize_model_parallel)
 from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import PlaceholderRange
 from vllm.multimodal.utils import (MediaConnector,
-                                   merge_and_sort_multimodal_metadata)
+                                   merge_and_sort_multimodal_metadata,
+                                   run_dp_sharded_vision_model)
+from vllm.platforms import current_platform
+from vllm.utils import get_open_port, update_environment_variables
 
 if TYPE_CHECKING:
     from vllm.multimodal.hasher import MultiModalHashDict
@@ -140,6 +150,19 @@ async def test_fetch_image_local_files(image_url: str):
                 f"file://{temp_dir}/../{os.path.basename(image_url)}")
 
 
+@pytest.mark.asyncio
+async def test_fetch_image_error_conversion():
+    connector = MediaConnector()
+    broken_img = "data:image/png;base64,aGVsbG9fdmxsbV9jb21tdW5pdHkK"
+
+    # PIL.UnidentifiedImageError should be converted to ValueError
+    with pytest.raises(ValueError):
+        await connector.fetch_image_async(broken_img)
+
+    with pytest.raises(ValueError):
+        connector.fetch_image(broken_img)
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
 @pytest.mark.parametrize("num_frames", [-1, 32, 1800])
@@ -399,3 +422,90 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
         assert modalities == expected_modalities
         assert ranges == expected_ranges
         assert hashes == expected_hashes
+
+
+class SimpleLinearModel(torch.nn.Module):
+    """A simple linear vision model for testing."""
+
+    def __init__(self, input_dim: int = 3 * 224 * 224, output_dim: int = 32):
+        super().__init__()
+        self.flatten = torch.nn.Flatten()
+        self.linear = torch.nn.Linear(input_dim, output_dim)
+
+    def forward(self, x: torch.Tensor):
+        # Flatten the input and apply linear transformation
+        x = self.flatten(x)
+        return self.linear(x)
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "batch_size",
+    [
+        1,  # Single image
+        4,  # Small batch
+        5,  # Odd batch size (for testing padding)
+    ],
+)
+def test_run_dp_sharded_vision_model(batch_size: int):
+    world_size = 2
+    # Launch processes
+    mp.spawn(
+        run_dp_sharded_vision_model_vs_direct,
+        args=(
+            world_size,
+            batch_size,
+            get_open_port(),
+        ),
+        nprocs=world_size,
+    )
+
+
+def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
+                                          batch_size: int, master_port: int):
+    """
+    Test that run_dp_sharded_vision_model produces the same results as 
+    calling the model directly.
+    """
+
+    # Set random seed for reproducibility
+    current_platform.seed_everything(0)
+
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+    torch.set_default_device(device)
+
+    update_environment_variables({
+        'RANK': str(local_rank),
+        'LOCAL_RANK': str(local_rank),
+        'WORLD_SIZE': str(world_size),
+        'MASTER_ADDR': 'localhost',
+        'MASTER_PORT': str(master_port),
+    })
+
+    # initialize distributed
+    init_distributed_environment()
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+
+    # Create a test input tensor
+    image_input = torch.randn(batch_size, 3, 224, 224)
+
+    # Create a simple linear model
+    vision_model = SimpleLinearModel()
+
+    # Run the model directly on the full input
+    with torch.inference_mode():
+        direct_output = vision_model(image_input)
+
+    # Run the model through the sharded function
+    with torch.inference_mode():
+        sharded_output = run_dp_sharded_vision_model(image_input, vision_model)
+
+    # Check that the world size is setup correctly
+    assert get_tensor_model_parallel_world_size() == world_size
+
+    # Check that the outputs have the same shape
+    assert direct_output.shape == sharded_output.shape
+
+    # Check that the outputs are close (they should be identical)
+    assert torch.allclose(direct_output, sharded_output, rtol=1e-5, atol=1e-5)
diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py
index e67624ecefcb6eecdb4e4b77b5a41749acaa83e4..9a700808d9d8a50fccc61b8f48eb458774958cb4 100644
--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import numpy as np
 import numpy.typing as npt
 import pytest
diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py
index 40fcfeeeac7d00d43d065ebd9679279c57d1a62b..23346509a06fdfcfd94b4b42533ea42d1bd05fd3 100644
--- a/tests/multimodal/utils.py
+++ b/tests/multimodal/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import numpy as np
 from PIL import Image
diff --git a/tests/neuron/1_core/test_activation.py b/tests/neuron/1_core/test_activation.py
index ec2b1238e40425eef837c2f64ce3f3d0ea9380bd..2d6e5f523cb853e30d808d64e0ea0f56462a2eb2 100644
--- a/tests/neuron/1_core/test_activation.py
+++ b/tests/neuron/1_core/test_activation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/neuron/1_core/test_block_table.py b/tests/neuron/1_core/test_block_table.py
index 033a36b4156b09882e47fcfae8a358a88cea703a..efec56360c1424a9524cb325677725f4cf970773 100644
--- a/tests/neuron/1_core/test_block_table.py
+++ b/tests/neuron/1_core/test_block_table.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import neuronxcc.nki.language as nl
 import pytest
diff --git a/tests/neuron/1_core/test_cache.py b/tests/neuron/1_core/test_cache.py
index 3d869cd2fa17f0f10c32bca5fce8c76b8a176f2d..670889ad6b58db6ec7324ed7b5718ca0271d4311 100644
--- a/tests/neuron/1_core/test_cache.py
+++ b/tests/neuron/1_core/test_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/neuron/1_core/test_layernorm.py b/tests/neuron/1_core/test_layernorm.py
index e96df8db6ccdf2a7e0f483ed30ae2ff2b3ebd262..c6fce1d1a06306c672012400c4d9ba20e04821ae 100644
--- a/tests/neuron/1_core/test_layernorm.py
+++ b/tests/neuron/1_core/test_layernorm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/neuron/1_core/test_logits_processor.py b/tests/neuron/1_core/test_logits_processor.py
index 6d1514088f90ced7b38a307885df0f582e4c4b93..ce9eadf5a883e25d77e5b5aabc99355a587c84b4 100644
--- a/tests/neuron/1_core/test_logits_processor.py
+++ b/tests/neuron/1_core/test_logits_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from unittest.mock import patch
diff --git a/tests/neuron/1_core/test_neuron_model_runner.py b/tests/neuron/1_core/test_neuron_model_runner.py
index 92417fb64f7f869a989ec43907e1788453cb5c95..5f3268810f9fe3b8ed97f8d95939c0c68a70b33a 100644
--- a/tests/neuron/1_core/test_neuron_model_runner.py
+++ b/tests/neuron/1_core/test_neuron_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from unittest.mock import MagicMock
 
diff --git a/tests/neuron/1_core/test_neuron_quant.py b/tests/neuron/1_core/test_neuron_quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..08630026959281a0cc5e390fe096f0e4c42a2bb8
--- /dev/null
+++ b/tests/neuron/1_core/test_neuron_quant.py
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.model_executor.layers.quantization.neuron_quant import (
+    NeuronQuantConfig)
+
+
+def test_get_supported_act_dtypes():
+    neuron_quant_config = NeuronQuantConfig()
+    supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes()
+    target_list = ["any_dtype1", "any_dtype2"]
+    for dtype in target_list:
+        assert dtype in supported_act_dtypes
diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py
index 8f7e711b525e3274db96be668eee03cc86d3f163..8b9a5f6e4a6aff6e22f7badf204b68b69669ab9b 100644
--- a/tests/neuron/1_core/test_prefix_prefill.py
+++ b/tests/neuron/1_core/test_prefix_prefill.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/neuron/1_core/test_rotary_embedding.py b/tests/neuron/1_core/test_rotary_embedding.py
index da57631fcfc598ec94770815e631e514e64fae27..a7ac79729986d748d9c03ef3a296554376a72c2c 100644
--- a/tests/neuron/1_core/test_rotary_embedding.py
+++ b/tests/neuron/1_core/test_rotary_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Tests for miscellaneous utilities
 """
diff --git a/tests/neuron/2_core/test_comm_ops.py b/tests/neuron/2_core/test_comm_ops.py
index 3cad160b2cb787b8c0553be769e755bef39eadba..85a48dae58aaf37b443dc64b37dd02dcb0b6d1f8 100644
--- a/tests/neuron/2_core/test_comm_ops.py
+++ b/tests/neuron/2_core/test_comm_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
 from typing import Callable
 from unittest.mock import patch
diff --git a/tests/neuron/2_core/test_eagle.py b/tests/neuron/2_core/test_eagle.py
index d71c88689a9944d4c7f0a6ece763945cca23ca92..cac642af03101ab5d484625b1adf8c66d073fab1 100644
--- a/tests/neuron/2_core/test_eagle.py
+++ b/tests/neuron/2_core/test_eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import os
diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py
index 3e651502d1e2a9f2b607a91a53fabb1ec3e5232a..d02fff943e90a357b9873d64dc8cea93d81f75de 100644
--- a/tests/neuron/2_core/test_mistral.py
+++ b/tests/neuron/2_core/test_mistral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import LLM, SamplingParams
 
diff --git a/tests/neuron/2_core/test_multi_lora.py b/tests/neuron/2_core/test_multi_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b97f47d4db34ebd5e80c467dbe04b092aefffe3
--- /dev/null
+++ b/tests/neuron/2_core/test_multi_lora.py
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from huggingface_hub import snapshot_download
+
+from vllm import LLM, SamplingParams
+from vllm.lora.request import LoRARequest
+
+
+def test_llama_single_lora():
+    sql_lora_files = snapshot_download(
+        repo_id="yard1/llama-2-7b-sql-lora-test")
+    llm = LLM(model="meta-llama/Llama-2-7b-hf",
+              tensor_parallel_size=2,
+              max_num_seqs=4,
+              max_model_len=512,
+              use_v2_block_manager=True,
+              override_neuron_config={
+                  "sequence_parallel_enabled": False,
+                  "skip_warmup": True,
+                  "lora_modules": [{
+                      "name": "lora_id_1",
+                      "path": sql_lora_files
+                  }]
+              },
+              enable_lora=True,
+              max_loras=1,
+              max_lora_rank=256,
+              device="neuron")
+    """For multi-lora requests using NxDI as the backend, only the lora_name 
+    needs to be specified. The lora_id and lora_path are supplied at the LLM 
+    class/server initialization, after which the paths are handled by NxDI"""
+    lora_req_1 = LoRARequest("lora_id_1", 0, " ")
+    prompts = [
+        "The president of the United States is",
+        "The capital of France is",
+    ]
+    outputs = llm.generate(prompts,
+                           SamplingParams(top_k=1),
+                           lora_request=[lora_req_1, lora_req_1])
+
+    expected_outputs = [
+        " the head of state and head of government of the United States. "
+        "The president direct",
+        " a city of contrasts. The city is home to the Eiffel Tower"
+    ]
+
+    for expected_output, output in zip(expected_outputs, outputs):
+        generated_text = output.outputs[0].text
+        assert (expected_output == generated_text)
+
+
+def test_llama_multiple_lora():
+    sql_lora_files = snapshot_download(
+        repo_id="yard1/llama-2-7b-sql-lora-test")
+    llm = LLM(model="meta-llama/Llama-2-7b-hf",
+              tensor_parallel_size=2,
+              max_num_seqs=4,
+              max_model_len=512,
+              use_v2_block_manager=True,
+              override_neuron_config={
+                  "sequence_parallel_enabled":
+                  False,
+                  "skip_warmup":
+                  True,
+                  "lora_modules": [{
+                      "name": "lora_id_1",
+                      "path": sql_lora_files
+                  }, {
+                      "name": "lora_id_2",
+                      "path": sql_lora_files
+                  }]
+              },
+              enable_lora=True,
+              max_loras=2,
+              max_lora_rank=256,
+              device="neuron")
+    """For multi-lora requests using NxDI as the backend, only the lora_name 
+    needs to be specified. The lora_id and lora_path are supplied at the LLM 
+    class/server initialization, after which the paths are handled by NxDI"""
+    lora_req_1 = LoRARequest("lora_id_1", 0, " ")
+    lora_req_2 = LoRARequest("lora_id_2", 1, " ")
+    prompts = [
+        "The president of the United States is",
+        "The capital of France is",
+    ]
+    outputs = llm.generate(prompts,
+                           SamplingParams(top_k=1),
+                           lora_request=[lora_req_1, lora_req_2])
+
+    expected_outputs = [
+        " the head of state and head of government of the United States. "
+        "The president direct",
+        " a city of contrasts. The city is home to the Eiffel Tower"
+    ]
+
+    for expected_output, output in zip(expected_outputs, outputs):
+        generated_text = output.outputs[0].text
+        assert (expected_output == generated_text)
diff --git a/tests/plugins/lora_resolvers/test_filesystem_resolver.py b/tests/plugins/lora_resolvers/test_filesystem_resolver.py
index cb0f0c3c5fa61ba0a5be54ae374a3e070a82888d..3e2c2577da66caba8ea81016228619bb340560b9 100644
--- a/tests/plugins/lora_resolvers/test_filesystem_resolver.py
+++ b/tests/plugins/lora_resolvers/test_filesystem_resolver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 import shutil
 
diff --git a/tests/plugins/vllm_add_dummy_model/setup.py b/tests/plugins/vllm_add_dummy_model/setup.py
index e3fb6efb275761e2473cb123416a421f859eb605..6307bb63897ac475ad9470594bf375523e0727a7 100644
--- a/tests/plugins/vllm_add_dummy_model/setup.py
+++ b/tests/plugins/vllm_add_dummy_model/setup.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from setuptools import setup
 
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
index 0c431cb39737b1325ab72621afe5ce9d92f88eac..b2085b01c45c1c335b5f2da957b8b1c6edb49ae0 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import ModelRegistry
 
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
index bc4a41cdf00de1deee1d5c697559eedd17b26317..aff3498567d2e579cfcc8a40ae99a33717f4bf5f 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Optional, Union
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
index c23ab64308f204483dd8805296e8c5335f91838b..da97cf7e2b40b9f968f55bc0f97db0873f453f2a 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
index bbd11ed4aac9d816bd56949906a67f06ee1ad53b..8c34407e3e0715f0bdce1a6a3b7630b1b4563c69 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py
index 10df0b5e050350bf21c26d543535308cecc968f6..e40f62f7749beb1ae2d605d66849d357d75e1baa 100644
--- a/tests/plugins/vllm_add_dummy_platform/setup.py
+++ b/tests/plugins/vllm_add_dummy_platform/setup.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from setuptools import setup
 
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
index 0d1b062ac2eb549d6fe010a9a444792b45e6b7fd..1b28342eb17916a759317461bb599b38d113daac 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
index 33425bbc11ed912900118774cdb6c151df91f823..f30a36f35f5d5da979ca8dee8b8d22254e4c906e 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.attention.backends.flash_attn import FlashAttentionBackend
 
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
index 5cefafc7e06c7d51d4a3fe60042685174f29b508..67cd5ed3b73dfd015f78b2e9298a4b1e182a266d 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.platforms.cuda import CudaPlatform
 
diff --git a/tests/plugins_tests/conftest.py b/tests/plugins_tests/conftest.py
index 8561f2ddfa26626bbc830bfb29d8f296b4a4321a..c8c1b81ca21837db7e182ef1093f378a4f632c76 100644
--- a/tests/plugins_tests/conftest.py
+++ b/tests/plugins_tests/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 207de53abd8d1b471b5152774e42d1878ba4e1bb..685a8cd2c8b82e697b092a5ff7f8f9e0f55435d4 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py
index 4c95a52a967bd7c344157308b0d5f3772a7da472..8c21216108685b8b30cdf37f561f7dc01283d711 100644
--- a/tests/plugins_tests/test_scheduler_plugins.py
+++ b/tests/plugins_tests/test_scheduler_plugins.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/pplx_utils.py b/tests/pplx_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d5d5be80c3f7dec23dcddf6dac0d40bcb17eebd
--- /dev/null
+++ b/tests/pplx_utils.py
@@ -0,0 +1,123 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import dataclasses
+import os
+import traceback
+from typing import Callable
+
+import torch
+from torch.multiprocessing import (
+    spawn)  # pyright: ignore[reportPrivateImportUsage]
+from typing_extensions import Concatenate, ParamSpec
+
+P = ParamSpec("P")
+
+
+@dataclasses.dataclass
+class ProcessGroupInfo:
+    world_size: int
+    world_local_size: int
+    rank: int
+    node_rank: int
+    local_rank: int
+    device: torch.device
+
+
+def _worker_parallel_launch(
+    local_rank: int,
+    world_size: int,
+    world_local_size: int,
+    node_rank: int,
+    init_method: str,
+    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> None:
+    rank = node_rank * world_local_size + local_rank
+    torch.cuda.set_device(local_rank)
+    device = torch.device("cuda", local_rank)
+    torch.distributed.init_process_group(
+        backend="cpu:gloo,cuda:nccl",
+        init_method=init_method,
+        rank=rank,
+        world_size=world_size,
+        device_id=device,
+    )
+    barrier = torch.tensor([rank], device=device)
+    torch.distributed.all_reduce(barrier)
+
+    try:
+        worker(
+            ProcessGroupInfo(
+                world_size=world_size,
+                world_local_size=world_local_size,
+                rank=rank,
+                node_rank=node_rank,
+                local_rank=local_rank,
+                device=device,
+            ),
+            *args,
+            **kwargs,
+        )
+    except Exception as ex:
+        print(ex)
+        traceback.print_exc()
+        raise
+    finally:
+        torch.distributed.destroy_process_group()
+
+
+def parallel_launch(
+    world_size: int,
+    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> None:
+    assert not kwargs
+    spawn(
+        _worker_parallel_launch,
+        args=(
+            world_size,
+            world_size,
+            0,
+            "tcp://localhost:29500",
+            worker,
+        ) + args,
+        nprocs=world_size,
+        join=True,
+    )
+
+
+def parallel_launch_from_env(
+    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> None:
+    """
+    Launches a worker function in parallel across all processes in the current
+    environment. The environment must have the following variables set:
+    - WORLD_SIZE: The total number of processes.
+    - WORLD_LOCAL_SIZE: The number of processes on the current node.
+    - NODE_RANK: The rank of the current
+    - MASTER_ADDR: The address of the master process.
+    - MASTER_PORT: The port of the master process.
+    """
+    assert not kwargs
+    world_size = int(os.environ["WORLD_SIZE"])
+    world_local_size = int(os.environ["WORLD_LOCAL_SIZE"])
+    node_rank = int(os.environ["NODE_RANK"])
+    assert "MASTER_ADDR" in os.environ
+    assert "MASTER_PORT" in os.environ
+    spawn(
+        _worker_parallel_launch,
+        args=(
+            world_size,
+            world_local_size,
+            node_rank,
+            "env://",
+            worker,
+        ) + args,
+        nprocs=world_local_size,
+        join=True,
+    )
diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py
index 4cc399175df415845e8c66829f073aba3276ffb8..f00a8f6998cbd641323bd6e9e3ce7b96fd221720 100644
--- a/tests/prefix_caching/test_disable_sliding_window.py
+++ b/tests/prefix_caching/test_disable_sliding_window.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the with and without prefix caching.
 
 Run `pytest tests/prefix_caching/test_prefix_caching.py`.
diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py
index 607b6c43e02e259a720cb8454a828ebc5f133928..a65fc934b16ab2ab6f7aaf185bd3fbf3dbdc7911 100644
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the with and without prefix caching.
 
 Run `pytest tests/prefix_caching/test_prefix_caching.py`.
diff --git a/tests/prompt_adapter/test_bloom.py b/tests/prompt_adapter/test_bloom.py
index a31d8e873d798f3a2adc056d3c2be3c753fc5308..2b603fe8f0228b4edf5d7a0e058680d0273963fa 100644
--- a/tests/prompt_adapter/test_bloom.py
+++ b/tests/prompt_adapter/test_bloom.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/prompt_adapter/test_multi_adapter_inference.py b/tests/prompt_adapter/test_multi_adapter_inference.py
index e249a6e64427a4ed9c4c455f7c21ae46a2062b47..4f273afb4e3688226e12a41be2156bf958d67bd5 100644
--- a/tests/prompt_adapter/test_multi_adapter_inference.py
+++ b/tests/prompt_adapter/test_multi_adapter_inference.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import EngineArgs, LLMEngine, SamplingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
diff --git a/tests/prompt_adapter/test_pa_lora.py b/tests/prompt_adapter/test_pa_lora.py
index fb4c3e1497652fbb00f4f7211577f27c58a81b1c..ba2e15b81bc1edcc234ffbd2796b26fac6d072f6 100644
--- a/tests/prompt_adapter/test_pa_lora.py
+++ b/tests/prompt_adapter/test_pa_lora.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from huggingface_hub import snapshot_download
 
diff --git a/tests/quantization/test_auto_round.py b/tests/quantization/test_auto_round.py
index 81ceecdb45d65a894a838e3e8aa762b0386bcda2..1c41d904b8168f496f72058b5060f7c660da40f1 100644
--- a/tests/quantization/test_auto_round.py
+++ b/tests/quantization/test_auto_round.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test model set-up and inference for quantized HF models supported
  on the AutoRound.
 
diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index e8ddfd7fc77959914c1569d530bd6ffa080a2610..325a902b3111258f841cb13346a031a444974f5d 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 '''Tests whether bitsandbytes computation is enabled correctly.
 
 Run `pytest tests/quantization/test_bitsandbytes.py`.
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index c968a68f1a8e8299c1569a5b204c616bf79599b6..d68aa22bed0cff017681eba2212e853cb05944e8 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test model set-up and weight loading for llmcompressor-quantized models.
 
 Run `pytest tests/quantization/test_compressed_tensors.py`.
@@ -13,9 +14,10 @@ from compressed_tensors.quantization import QuantizationType
 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensors24, CompressedTensorsLinearMethod,
-    CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
-    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
-    CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
+    CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4,
+    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
+    CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
+    CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     sparse_cutlass_supported)
 from vllm.platforms import current_platform
@@ -650,9 +652,13 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
         assert output
 
 
-def test_compressed_tensors_nvfp4a16(vllm_runner):
-    # run weight only example
-    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP4"
+@pytest.mark.parametrize(
+    "args",
+    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16",
+      CompressedTensorsW4A16Fp4),
+     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4)])
+def test_compressed_tensors_nvfp4(vllm_runner, args):
+    model, scheme = args
     with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
@@ -661,7 +667,7 @@ def test_compressed_tensors_nvfp4a16(vllm_runner):
             qkv_proj = layer.self_attn.qkv_proj
             assert isinstance(qkv_proj.quant_method,
                               CompressedTensorsLinearMethod)
-            assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Fp4)
+            assert isinstance(qkv_proj.scheme, scheme)
             assert qkv_proj.scheme.group_size == 16
 
         llm.apply_model(check_model)
diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py
index e30166842ea8af44ca3c325457c5bac244accc9e..8b0ffc0fe42f117df2a3d49a671ca73fbc27b711 100644
--- a/tests/quantization/test_configs.py
+++ b/tests/quantization/test_configs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests whether Marlin models can be loaded from the autogptq config.
 
 Run `pytest tests/quantization/test_configs.py --forked`.
diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py
index a05eb494c11a7210622fa1041c2a3d17ba267f3b..08d9573ecf0b84967cb40305584c94fada19309a 100644
--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -1,4 +1,5 @@
-# SPDX-License-Identifier: Apache-2.0
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Expanded quantized model tests for CPU offloading
 # Base tests: tests/basic_correctness/test_cpu_offload.py
diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py
index b6db6d5f2fdc5ca47ed5b68fd5d2491790f7aa7d..50179b9a904d24e637fd31ffdf378ffa40063b17 100644
--- a/tests/quantization/test_experts_int8.py
+++ b/tests/quantization/test_experts_int8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # flake8: noqa
 """Tests experts_int8 quantization startup and generation, 
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index e74e14a0dcb64e587b4f4b8b8dd6acbff2e624ff..e5ab7b3dd3cfb80e56f5301521d3cf7ebf29221a 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests whether FP8 computation is enabled correctly.
 
 Run `pytest tests/quantization/test_fp8.py --forked`.
diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py
index 22055c49ae296986d10484c0058be5a714216165..23b999e7c679b4d77fbd175d544b8ef1b5998b51 100644
--- a/tests/quantization/test_gptq_dynamic.py
+++ b/tests/quantization/test_gptq_dynamic.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests whether gptq models with dynamic quantized can be loaded.
 
 Run `pytest tests/quantization/test_gptq_dynamic.py --forked`.
diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py
index 0e3913676f5f7f44352f31660712d14abc2c9f84..34b1b6c2e5b6deb47c1f0d8b10b57efd9119e480 100644
--- a/tests/quantization/test_ipex_quant.py
+++ b/tests/quantization/test_ipex_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test model set-up and inference for quantized HF models supported
  on the CPU/GPU backend using IPEX (including AWQ/GPTQ).
  
diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py
index 1c6bd18521c31ab58ceab54a7ae379a21ff52214..11f78a23bb4c063b8085a73a41037449dc0c0470 100644
--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests whether gptq models with quantized lm_head can be loaded.
 
 Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`.
diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py
index 9bbb5e327968fd5ef4ef6c6cd07e59fb7b5d1db9..5f78bc30504c02de90e7249d6a4026df1da2c20b 100644
--- a/tests/quantization/test_ptpc_fp8.py
+++ b/tests/quantization/test_ptpc_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests whether PTPC w8a8 FP8 computation is enabled correctly.
 
 Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py
index ae09ac58e6759fb888229e085ec38f3567021656..3571f773fb0236bba7f774670be1713f3ba88c15 100644
--- a/tests/quantization/test_quark.py
+++ b/tests/quantization/test_quark.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test model set-up and weight loading for quark-quantized models.
 
 Run `pytest tests/quantization/test_quark.py`.
diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py
index 0ea71aaf828bc894a59b1325ad46c7652aadd2aa..42081a8c68cdc10305160105f14eed2800fb4e94 100644
--- a/tests/quantization/test_register_quantization_config.py
+++ b/tests/quantization/test_register_quantization_config.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests register custom quantization config.
 
 See https://github.com/vllm-project/vllm/issues/11926 for more details.
diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py
index 6571fc9e471bd86e29a3665d3b5e44a2d6721408..54ec59585450745db9b5849467965b04694a4de6 100644
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib.metadata
 import importlib.util
 
@@ -12,7 +13,7 @@ TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None
 
 @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
 def test_pre_quantized_model(vllm_runner):
-    with vllm_runner("drisspg/float8_dynamic_act_float8_weight-opt-125m",
+    with vllm_runner("drisspg/fp8-opt-125m",
                      quantization="torchao",
                      dtype="bfloat16",
                      enforce_eager=True) as llm:
@@ -29,10 +30,10 @@ def test_pre_quantized_model(vllm_runner):
         "cuda:0",
         # {"": "cuda"},
     ])
-def test_opt_125m_int4wo_model_loading_with_params(vllm_runner,
+def test_opt_125m_int8wo_model_loading_with_params(vllm_runner,
                                                    pt_load_map_location):
     torch._dynamo.reset()
-    model_name = "jerryzh168/opt-125m-int4wo"
+    model_name = "jerryzh168/opt-125m-int8wo-partial-quant"
     with vllm_runner(model_name=model_name,
                      quantization="torchao",
                      dtype="bfloat16",
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
index 7a339c162cc48a45a8cde13c8c5f77f870169628..20a425b721145ac5ca00dc046698508b352c5c6a 100644
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.model_executor.layers.quantization import get_quantization_config
 from vllm.platforms import current_platform
diff --git a/tests/reasoning/test_deepseekr1_reasoning_parser.py b/tests/reasoning/test_deepseekr1_reasoning_parser.py
index 1b669c8fd2fb913162818d57d5381b0d03f1eac9..987f3c48de0c0c592f230f65643870fb4940d91b 100644
--- a/tests/reasoning/test_deepseekr1_reasoning_parser.py
+++ b/tests/reasoning/test_deepseekr1_reasoning_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from transformers import AutoTokenizer
diff --git a/tests/reasoning/test_granite_reasoning_parser.py b/tests/reasoning/test_granite_reasoning_parser.py
index 48fb8c2f8d1b9303cc0595be88e0e56e2992a903..38cab73a45f22b711566680c629e4d1d13ef5cc6 100644
--- a/tests/reasoning/test_granite_reasoning_parser.py
+++ b/tests/reasoning/test_granite_reasoning_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 from transformers import AutoTokenizer
 
diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py
index 95b7460d359e49fc4ba8c6173ff95a41f8a1980c..2d5557d5cdc13b260a3df61ebe7952049d872135 100644
--- a/tests/reasoning/test_qwen3_reasoning_parser.py
+++ b/tests/reasoning/test_qwen3_reasoning_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from transformers import AutoTokenizer
diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py
index 0f894ed800c6c2ba1e21fcb44d6194f628ea8ece..ddcf89796fb5a4aff353ad42315cda05f5338a60 100644
--- a/tests/reasoning/utils.py
+++ b/tests/reasoning/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
diff --git a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
index 8b96184f579e4ada97ee2a81dee2e9c233af9d6c..e27d9958f291726ff568f4b7b42eed14c659986e 100644
--- a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
+++ b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import SamplingParams
 from vllm.config import LoadConfig, LoadFormat
diff --git a/tests/runai_model_streamer_test/test_weight_utils.py b/tests/runai_model_streamer_test/test_weight_utils.py
index 06e506c35761eaa714df418508556c3b7e27a9a5..ee448c2ccb213fb8f6c6709f862fa9c2aeb1eb47 100644
--- a/tests/runai_model_streamer_test/test_weight_utils.py
+++ b/tests/runai_model_streamer_test/test_weight_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import glob
 import tempfile
diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py
index 5de1137eaf682659560eb3a9724b26f14e3fabb7..bdf48c7687b2558c75b3f15aa45d87d6d19d6afc 100644
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the outputs of HF and vLLM when using beam search.
 
 Run `pytest tests/samplers/test_beam_search.py`.
diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py
index 2a124aa0c5960ff1025b8f64d9c350e790a75846..7eb9c0b5fb8c88f26eec55308c13f0ecfea10266 100644
--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Make sure ignore_eos works.
 
 Run `pytest tests/samplers/test_ignore_eos.py`.
diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py
index 74f1eb4a9547736a8b8341f234809e5bfc32d273..901c8759126431d275ad1330acfa20b53c6bc731 100644
--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
index 5cc646e76ec848fffc9655c6dac28c5832e7b089..86c8a03eee10f9e9e8cc0337aab625c1c40ad72a 100644
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py
index 355e3adcf5f30f3c263c9e9177a338a264bfa727..42b529ae169ded13726c855b0b14402e748fe138 100644
--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Make sure bad_words works.
 
 Run `pytest tests/samplers/test_no_bad_words.py`.
@@ -103,7 +104,7 @@ class TestTwoTokenBadWord:
                                                 add_special_tokens=False)[0]
 
     def test_two_token_bad_word(self, vllm_runner):
-        with vllm_runner(self.MODEL) as llm:
+        with vllm_runner(self.MODEL, dtype="half") as llm:
             output_token_ids = self._generate(llm)
             assert output_token_ids[:2] == [
                 self.target_token_id1, self.target_token_id2
diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py
index ebe9b302148c0c5ee6aaa617135ad3da5c0a12bb..86fc14dc85f8016002b7e6b087da60b871841018 100644
--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py
index 6ef61f2ff40696ccc38e268bd6b8b22b4038a12f..3b93c64113daccc1fc42cdf3fcbb587958e14091 100644
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for rejection sampling."""
 
 import pytest
diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index 7b19d5750906dcf562ffd21e916e18ab34899f02..520b88d03ac8ebd03c44b67b10d3579d4fc166fd 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 import random
diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py
index efa2642dba9713ef7ddeecf43f6820401a5ea23c..b339b4b2ddf3d4552d2c2de14fff118a38f6b12d 100644
--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Verify that seeded random sampling is deterministic.
 
 Run `pytest tests/samplers/test_seeded_generate.py`.
diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py
index 279e5ed100d975f0f60955e83053aa2c4da3d291..418471b8e5238e1223d0690dcc8ff69940b39fb4 100644
--- a/tests/samplers/test_typical_acceptance_sampler.py
+++ b/tests/samplers/test_typical_acceptance_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for rejection sampling."""
 
 import pytest
diff --git a/tests/spec_decode/conftest.py b/tests/spec_decode/conftest.py
index 1a20e2c135c2ef40fb0feda5ae4e11e873eb8ac9..375b248ebedaaed26ed19e90f78373d3d528fcc2 100644
--- a/tests/spec_decode/conftest.py
+++ b/tests/spec_decode/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 921081f3c3f2e11f8a006e68c4a2c8744f5fd6fb..f3fe9db3f79ea3b5afb2a797436cb7893d47ab0e 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from itertools import cycle
diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py
index 4fd52cf7e2cb31d0af10bdb5c8e4fb367a2eff5a..6c453879a6a6acb46829aa04c3c42329f23e0103 100644
--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py
index eee535a146f4518dd3420b53bbfc966ca83ada7b..98939461422e1ca240753fc0e537f385bae47db7 100644
--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py
index 9dfc1b2fd91efaadb1c8bbeccd7e4477e34567b8..7608618502966265ce29c055242c2c5edc0222ea 100644
--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests which cover integration of the speculative decoding framework with
 other features, e.g. cuda graphs.
 """
diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py
index b112974754208361cc6ff4af781efca2b93ab053..a18be80c50dd9e435bfe984d49b795b36b88dac1 100644
--- a/tests/spec_decode/e2e/test_integration_dist_tp2.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests which cover integration of the speculative decoding framework with
 tensor parallelism.
 """
diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py
index a1b7c8b40c39ddda78963f59db312860636f35aa..039eec8fd2cc9dfc0546b15707c4d0e424f45334 100644
--- a/tests/spec_decode/e2e/test_integration_dist_tp4.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests which cover integration of the speculative decoding framework with
 tensor parallelism.
 """
diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py
index cb2dae541411af14979ac80f1c0f1185349f77b6..1629c69f8ee9df2a538acfe1283f6b78a7c96b1e 100644
--- a/tests/spec_decode/e2e/test_logprobs.py
+++ b/tests/spec_decode/e2e/test_logprobs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import cycle
 
diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py
index 5c60100e6797e90390f820ddfc49a6f36fec6818..064a6e10ae6ef286afee009b799a26216ac26ad3 100644
--- a/tests/spec_decode/e2e/test_medusa_correctness.py
+++ b/tests/spec_decode/e2e/test_medusa_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py
index 7bf29349d67248fa4f75959e5f5106fd56df4356..9f778ca8d179b2d25f33c24a9bc72fc49f4cb7f2 100644
--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_mtp_correctness.py b/tests/spec_decode/e2e/test_mtp_correctness.py
index 371e6834b6398a5696a4c413a5d175b2f917d246..d4d4d519b7a14e391f60d6cc73fbcf66f81fb1d5 100644
--- a/tests/spec_decode/e2e/test_mtp_correctness.py
+++ b/tests/spec_decode/e2e/test_mtp_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py
index e187b6bc1434780f0e476e4f2d41614a3dfd864b..6d385184d264a266be13f31d4b77964d61a340e5 100644
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """The tests in this file verify end-to-end speculative decoding correctness.
 
 This docstring details important information on the testing methodology.
diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py
index eca433ffa1d0b08d9debce3166aa6424946b0e0e..c10329a9ba974d60e22fd96cef203aca5a2ec428 100644
--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This docstring details important information on the testing methodology.
 
 Most of the tests rely on "greedy equality", where we expect the output of
diff --git a/tests/spec_decode/e2e/test_seed.py b/tests/spec_decode/e2e/test_seed.py
index 3dc37172285e98ddcad435837873f4cd310f7d5f..4cf373809dba278fb52a0c5160a927df5e46103a 100644
--- a/tests/spec_decode/e2e/test_seed.py
+++ b/tests/spec_decode/e2e/test_seed.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py
index 9edd8bd4c00d7b7e7ab38cfa2acfcd902d14a956..d20c549b09052f32f11a9b1f946ae2676d92e9cb 100644
--- a/tests/spec_decode/test_batch_expansion.py
+++ b/tests/spec_decode/test_batch_expansion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py
index 0bff0ea1d7dba513215b774bfc90057c66415e6d..407786ad3c6471843f5c850cb9d0f2ed9a1accc3 100644
--- a/tests/spec_decode/test_dynamic_spec_decode.py
+++ b/tests/spec_decode/test_dynamic_spec_decode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import MagicMock, patch
 
diff --git a/tests/spec_decode/test_memory_usage.py b/tests/spec_decode/test_memory_usage.py
index 16dffe6d7d699590489daf28fd45c82076220970..5d9dd3f72a78aa0d76eb64b82c893010671858ee 100644
--- a/tests/spec_decode/test_memory_usage.py
+++ b/tests/spec_decode/test_memory_usage.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This docstring details important information on the testing methodology.
 
 This test verifies that memory usage remains constant (or never grows) when 
diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py
index 1a6693e168173b9ea46fb274cb2b3d25c1885e91..e8de410f8a941a3b7c6e7d7869dbbc99a1ab68e8 100644
--- a/tests/spec_decode/test_metrics.py
+++ b/tests/spec_decode/test_metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from unittest.mock import MagicMock
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index ca37c9a68dfa49912065f6913bdf146709581eca..f2d93203b8e10e7b6ba4ac5ee4ef16a35dbac11b 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from unittest.mock import MagicMock
diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py
index 7de54b3edb6c600b49ccbda81a7b5056e8b7dce3..8a7c1148568116b652c43ba6ee513ad1666f80d7 100644
--- a/tests/spec_decode/test_ngram_worker.py
+++ b/tests/spec_decode/test_ngram_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py
index f73cf4b345fb264992e6e4c8a1d74835391736e8..55fcf00557476f314f3d86205223fb212bea6510 100644
--- a/tests/spec_decode/test_scorer.py
+++ b/tests/spec_decode/test_scorer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index f7ef9786a690e019d64e888764cc06430a971c1a..8aceaadff8d38256a046a9adfb1d5a946fee2127 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from collections import defaultdict
diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py
index 24573e22487d01d8b78d1d4541d27df2dac935aa..9cfc618b9d950c25b6b43612712d0cb01649dbf1 100644
--- a/tests/spec_decode/test_utils.py
+++ b/tests/spec_decode/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import MagicMock
 
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index d303b7f1219a539f67aefcc9fc2132d5fc04ec03..1733f66feec073322f1b455ba5d448ed35601e8b 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence as GenericSequence
 from itertools import count
diff --git a/tests/standalone_tests/lazy_imports.py b/tests/standalone_tests/lazy_imports.py
index 61e3b387973bc66240579f529cf57becc453e30a..21bcb6b822d1f3088e0cc9cc8f441e6c2a304e7a 100644
--- a/tests/standalone_tests/lazy_imports.py
+++ b/tests/standalone_tests/lazy_imports.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Description: Test the lazy import module
 # The utility function cannot be placed in `vllm.utils`
diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py
index ce8689f5b89c11b79f81cc2948a87bc07f2c0142..cd59d579e8d6fe3348e679900f0d92d359154bd2 100644
--- a/tests/tensorizer_loader/conftest.py
+++ b/tests/tensorizer_loader/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 from vllm.distributed import cleanup_dist_env_and_memory
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index b6286e1483976cd9bb105de8b169025372f144e2..c97f5968d58a2540fdb3d31cce727edcaf2a12d4 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import gc
 import os
 import pathlib
 import subprocess
-from unittest.mock import MagicMock, patch
 
 import pytest
 import torch
@@ -16,7 +16,6 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                          TensorSerializer,
                                                          is_vllm_tensorized,
-                                                         load_with_tensorizer,
                                                          open_stream,
                                                          tensorize_vllm_model)
 # yapf: enable
@@ -61,21 +60,6 @@ def write_keyfile(keyfile_path: str):
         f.write(encryption_params.key)
 
 
-@patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent')
-def test_load_with_tensorizer(mock_agent, tensorizer_config):
-    mock_linear_method = MagicMock()
-    mock_agent_instance = mock_agent.return_value
-    mock_agent_instance.deserialize.return_value = MagicMock()
-
-    result = load_with_tensorizer(tensorizer_config,
-                                  quant_method=mock_linear_method)
-
-    mock_agent.assert_called_once_with(tensorizer_config,
-                                       quant_method=mock_linear_method)
-    mock_agent_instance.deserialize.assert_called_once()
-    assert result == mock_agent_instance.deserialize.return_value
-
-
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_can_deserialize_s3(vllm_runner):
     model_ref = "EleutherAI/pythia-1.4b"
diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py
index 05d2c624df1785d81b8da3ae0ac49d86ce34275e..edc0849dff33f8dc84a30055ae930ac05bfd210a 100644
--- a/tests/test_cache_block_hashing.py
+++ b/tests/test_cache_block_hashing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test hashing of cache blocks.
 
 Run `pytest tests/test_cache_block_hashing.py`.
diff --git a/tests/test_config.py b/tests/test_config.py
index 7db95e3f64502dc41e0d3522e796a09e58586dce..ce383e1b420af765a2b8e6b7a8122c85c325cc94 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -1,10 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import MISSING, Field, asdict, dataclass, field
 from typing import Literal, Union
 
 import pytest
 
+from vllm.compilation.backends import VllmBackend
 from vllm.config import (LoadConfig, ModelConfig, PoolerConfig, VllmConfig,
                          config, get_field)
 from vllm.model_executor.layers.pooler import PoolingType
@@ -43,6 +45,18 @@ def test_config(test_config, expected_error):
         config(test_config)
 
 
+def test_compile_config_repr_succeeds():
+    # setup: VllmBackend mutates the config object
+    config = VllmConfig()
+    backend = VllmBackend(config)
+    backend.configure_post_pass()
+
+    # test that repr(config) succeeds
+    val = repr(config)
+    assert 'VllmConfig' in val
+    assert 'inductor_passes' in val
+
+
 def test_get_field():
 
     @dataclass
diff --git a/tests/test_embedded_commit.py b/tests/test_embedded_commit.py
index a9b4f5cbf78c3a68d229bbf9a8a57a123a5366ec..b9593e2a3b7c04dbc60dd34b622cca5093031529 100644
--- a/tests/test_embedded_commit.py
+++ b/tests/test_embedded_commit.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import vllm
 
diff --git a/tests/test_inputs.py b/tests/test_inputs.py
index d361808ed2f9a67f8c4e1c88a3e54284235f3de0..e549834faf6f71659a6584988be38706ee68939a 100644
--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/test_logger.py b/tests/test_logger.py
index 046f70504c899f4dbebf4e774f87e133f6cf2fee..8f235f1474fe275a2018f215dc31b3e4c3947e7e 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import enum
 import json
 import logging
diff --git a/tests/test_outputs.py b/tests/test_outputs.py
index c41bd6723ba113c38a56b64c12039c36feab3304..4bb1c20f77f1d75e5fc7d7caf39c088bf0b3e6df 100644
--- a/tests/test_outputs.py
+++ b/tests/test_outputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.outputs import RequestOutput
 
diff --git a/tests/test_regression.py b/tests/test_regression.py
index e092945422edb59e110113e143fda4b380c85b84..f5f1ed8e805e00aaaee1e236638caf0dcaf07743 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Containing tests that check for regressions in vLLM's behavior.
 
 It should include tests that are reported by users and making sure they
diff --git a/tests/test_sampling_params.py b/tests/test_sampling_params.py
index 9af810c4c1bca7aec284fa17ff1fa255e4840686..39e3808d831ca8e9dc0ef34124dc71a8e1b8c3e6 100644
--- a/tests/test_sampling_params.py
+++ b/tests/test_sampling_params.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the SamplingParams class.
 """
 
diff --git a/tests/test_scalartype.py b/tests/test_scalartype.py
index eecfa1db3d7e5adf00175632beaf33ef2ffdf4f7..ef4aef3afc2e20b01e77e3134feb08c83da809ae 100644
--- a/tests/test_scalartype.py
+++ b/tests/test_scalartype.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/test_seed_behavior.py b/tests/test_seed_behavior.py
index c45ed6926d7725dfd834d922657fa84f8c68a2be..e9138b9e8eb6138f7e8787307b99873c7866c02b 100644
--- a/tests/test_seed_behavior.py
+++ b/tests/test_seed_behavior.py
@@ -1,4 +1,5 @@
-# SPDX-License-Identifier: Apache-2.0
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
 
 import numpy as np
diff --git a/tests/test_sequence.py b/tests/test_sequence.py
index 902de1099e6051bc56b4775893ffa52e6b224f93..a782a3bf7716bac864aa9fd31d7dd73f119629a5 100644
--- a/tests/test_sequence.py
+++ b/tests/test_sequence.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py
index 77fec0968000f4f5ebd583193ef61b25275adb42..64706defb5960518f68b0f8232b320eeb223bdd7 100644
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import multiprocessing as mp
 import os
diff --git a/tests/test_triton_utils.py b/tests/test_triton_utils.py
index eb8ad48fdead4c014720b40eac41e42c5949a053..64f72668f29ce0d59f2b9eca4c72b0dbcde810db 100644
--- a/tests/test_triton_utils.py
+++ b/tests/test_triton_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import sys
 import types
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 0b88d05efeaad56a7f06c37eb93d4844732ec2aa..a2fd845ea54b78f74d0a010bd43fc3bb1cd0df4a 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa
 
 import asyncio
@@ -17,7 +18,8 @@ from vllm_test_utils.monitor import monitor
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache,
                         MemorySnapshot, PlaceholderModule, StoreBoolean,
-                        bind_kv_cache, deprecate_kwargs, get_open_port,
+                        bind_kv_cache, common_broadcastable_dtype,
+                        deprecate_kwargs, get_open_port, is_lossless_cast,
                         make_zmq_path, make_zmq_socket, memory_profiling,
                         merge_async_iterators, sha256, split_zmq_path,
                         supports_kw, swap_dict_values)
@@ -258,11 +260,18 @@ def test_dict_args(parser):
         "--model-name=something.something",
         "--hf-overrides.key1",
         "val1",
+        # Test nesting
         "--hf-overrides.key2.key3",
         "val2",
         "--hf-overrides.key2.key4",
         "val3",
+        # Test = sign
         "--hf-overrides.key5=val4",
+        # Test underscore to dash conversion
+        "--hf_overrides.key_6",
+        "val5",
+        "--hf_overrides.key-7.key_8",
+        "val6",
     ]
     parsed_args = parser.parse_args(args)
     assert parsed_args.model_name == "something.something"
@@ -273,6 +282,10 @@ def test_dict_args(parser):
             "key4": "val3",
         },
         "key5": "val4",
+        "key_6": "val5",
+        "key-7": {
+            "key_8": "val6",
+        },
     }
 
 
@@ -567,12 +580,65 @@ def test_lru_cache():
     assert 6 in cache
 
 
+# yapf: disable
+@pytest.mark.parametrize(
+    ("src_dtype", "tgt_dtype", "expected_result"),
+    [
+        # Different precision_levels
+        (torch.bool, torch.int8, True),
+        (torch.bool, torch.float16, True),
+        (torch.bool, torch.complex32, True),
+        (torch.int64, torch.bool, False),
+        (torch.int64, torch.float16, True),
+        (torch.int64, torch.complex32, True),
+        (torch.float64, torch.bool, False),
+        (torch.float64, torch.int8, False),
+        (torch.float64, torch.complex32, True),
+        (torch.complex128, torch.bool, False),
+        (torch.complex128, torch.int8, False),
+        (torch.complex128, torch.float16, False),
+        # precision_level=0
+        (torch.bool, torch.bool, True),
+        # precision_level=1
+        (torch.int8, torch.int16, True),
+        (torch.int16, torch.int8, False),
+        (torch.uint8, torch.int8, False),
+        (torch.int8, torch.uint8, False),
+        # precision_level=2
+        (torch.float16, torch.float32, True),
+        (torch.float32, torch.float16, False),
+        (torch.bfloat16, torch.float32, True),
+        (torch.float32, torch.bfloat16, False),
+        # precision_level=3
+        (torch.complex32, torch.complex64, True),
+        (torch.complex64, torch.complex32, False),
+    ],
+)
+# yapf: enable
+def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
+    assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
+
+
+# yapf: disable
+@pytest.mark.parametrize(
+    ("dtypes", "expected_result"),
+    [
+        ([torch.bool], torch.bool),
+        ([torch.bool, torch.int8], torch.int8),
+        ([torch.bool, torch.int8, torch.float16], torch.float16),
+        ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32),  # noqa: E501
+    ],
+)
+# yapf: enable
+def test_common_broadcastable_dtype(dtypes, expected_result):
+    assert common_broadcastable_dtype(dtypes) == expected_result
+
+
 def test_placeholder_module_error_handling():
     placeholder = PlaceholderModule("placeholder_1234")
 
     def build_ctx():
-        return pytest.raises(ModuleNotFoundError,
-                             match="No module named")
+        return pytest.raises(ModuleNotFoundError, match="No module named")
 
     with build_ctx():
         int(placeholder)
@@ -608,6 +674,7 @@ def test_placeholder_module_error_handling():
         _ = placeholder_attr.module
 
 
+# yapf: disable
 @pytest.mark.parametrize(
     "obj,key1,key2",
     [
@@ -618,6 +685,7 @@ def test_placeholder_module_error_handling():
         # Tests for both keys do not exist
         ({1: "a", 2: "b"}, 3, 4),
     ])
+# yapf: enable
 def test_swap_dict_values(obj, key1, key2):
     original_obj = obj.copy()
     swap_dict_values(obj, key1, key2)
@@ -631,19 +699,19 @@ def test_swap_dict_values(obj, key1, key2):
         assert key1 not in obj
 
 
-def test_model_specification(parser_with_config,
-                             cli_config_file,
+def test_model_specification(parser_with_config, cli_config_file,
                              cli_config_file_with_model):
     # Test model in CLI takes precedence over config
-    args = parser_with_config.parse_args([
-        'serve', 'cli-model', '--config', cli_config_file_with_model
-    ])
+    args = parser_with_config.parse_args(
+        ['serve', 'cli-model', '--config', cli_config_file_with_model])
     assert args.model_tag == 'cli-model'
     assert args.served_model_name == 'mymodel'
 
     # Test model from config file works
     args = parser_with_config.parse_args([
-        'serve', '--config', cli_config_file_with_model,
+        'serve',
+        '--config',
+        cli_config_file_with_model,
     ])
     assert args.model == 'config-model'
     assert args.served_model_name == 'mymodel'
@@ -654,17 +722,19 @@ def test_model_specification(parser_with_config,
 
     # Test using --model option raises error
     with pytest.raises(
-        ValueError,
-        match=(
-            "With `vllm serve`, you should provide the model as a positional "
-            "argument or in a config file instead of via the `--model` option."
-        ),
+            ValueError,
+            match=
+        ("With `vllm serve`, you should provide the model as a positional "
+         "argument or in a config file instead of via the `--model` option."),
     ):
         parser_with_config.parse_args(['serve', '--model', 'my-model'])
 
     # Test other config values are preserved
     args = parser_with_config.parse_args([
-        'serve', 'cli-model', '--config', cli_config_file_with_model,
+        'serve',
+        'cli-model',
+        '--config',
+        cli_config_file_with_model,
     ])
     assert args.tensor_parallel_size == 2
     assert args.trust_remote_code is True
@@ -673,7 +743,7 @@ def test_model_specification(parser_with_config,
 
 
 @pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
-                                    (None, bool, [1, 2, 3])])
+                                   (None, bool, [1, 2, 3])])
 @pytest.mark.parametrize("output", [0, 1, 2])
 def test_sha256(input: tuple, output: int):
     hash = sha256(input)
@@ -682,7 +752,8 @@ def test_sha256(input: tuple, output: int):
     assert hash != 0
 
     bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
-    assert hash == int.from_bytes(hashlib.sha256(bytes).digest(), byteorder="big")
+    assert hash == int.from_bytes(hashlib.sha256(bytes).digest(),
+                                  byteorder="big")
 
     # hashing again, returns the same value
     assert hash == sha256(input)
@@ -698,8 +769,7 @@ def test_sha256(input: tuple, output: int):
         ("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
         ("tcp://[::1]:5555", ("tcp", "::1", "5555")),  # IPv6 address
         ("inproc://some_identifier", ("inproc", "some_identifier", "")),
-    ]
-)
+    ])
 def test_split_zmq_path(path, expected):
     assert split_zmq_path(path) == expected
 
@@ -711,8 +781,7 @@ def test_split_zmq_path(path, expected):
         "tcp://127.0.0.1",  # Missing port
         "tcp://[::1]",  # Missing port for IPv6
         "tcp://:5555",  # Missing host
-    ]
-)
+    ])
 def test_split_zmq_path_invalid(invalid_path):
     with pytest.raises(ValueError):
         split_zmq_path(invalid_path)
@@ -734,7 +803,8 @@ def test_make_zmq_socket_ipv6():
     zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)
 
     # Verify that the IPV6 option is set
-    assert zsock.getsockopt(zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses"
+    assert zsock.getsockopt(
+        zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses"
 
     # Clean up
     zsock.close()
diff --git a/tests/test_version.py b/tests/test_version.py
index 56842b6d409d3489a4ea678169cf7540e0897ca8..fd07abb59b1f86ff47ce734813588b055c2b2502 100644
--- a/tests/test_version.py
+++ b/tests/test_version.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest.mock import patch
 
diff --git a/tests/test_vllm_port.py b/tests/test_vllm_port.py
index ccbb36bf4c06cb1fb5a0ff7d4a33e5e00285e7d1..88e1efd8fdbb69557115a5bfc2dd8dfbebfb42e6 100644
--- a/tests/test_vllm_port.py
+++ b/tests/test_vllm_port.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from unittest.mock import patch
diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py
index c740fde42636004fb01d38304548e351da1e9522..e218678c4363b962c4044ad2eb3ed30eea45bd4e 100644
--- a/tests/tokenization/test_cached_tokenizer.py
+++ b/tests/tokenization/test_cached_tokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pickle
 from copy import deepcopy
 
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 079100e78b5f08aa2f87d6cfb726f68da88595b0..9f2414eca24f3f94f7cb0249808753469bcab5a4 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Generator
 from typing import Any, Optional
@@ -69,7 +70,8 @@ def _run_incremental_decode(tokenizer,
                                 None,
                                 0.0,
                                 None,
-                                cache_salt=None)
+                                cache_salt=None,
+                                data_parallel_rank=None)
 
     if fast is None:
         detokenizer = IncrementalDetokenizer.from_new_request(
diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py
index 8942f889128300b1f7addcf94bf6d4959f62263b..d8288429351c4bb79c38ec62b4f696afd351ef4a 100644
--- a/tests/tokenization/test_get_eos.py
+++ b/tests/tokenization/test_get_eos.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This test file includes some cases where it is inappropriate to
 only get the `eos_token_id` from the tokenizer as defined by
diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py
index b16d9af35be989d51e583215f3f86fddf7f002f7..69b3c6294284bebe2f972e4663cf7f7f4cd26e93 100644
--- a/tests/tokenization/test_mistral_tokenizer.py
+++ b/tests/tokenization/test_mistral_tokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from mistral_common.protocol.instruct.messages import (AssistantMessage,
diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py
index eddc630986ea557be282353fcab6c1a0c8043f36..09a3638fd2ed11a9aa3f04c6826f1cf04c6fb229 100644
--- a/tests/tokenization/test_tokenizer.py
+++ b/tests/tokenization/test_tokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from transformers import PreTrainedTokenizerBase
diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py
index bcfa78ed41cf5e903970112e8cf7a7425610b1de..0570c1525e1115ebd7369b5b85c01dc7cf7c5095 100644
--- a/tests/tokenization/test_tokenizer_group.py
+++ b/tests/tokenization/test_tokenizer_group.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py
index 772eeb345ca4d3a45e0a5d1e11154247e94c5c30..5abb1016440865fb5984f1c8286c508c575456b6 100644
--- a/tests/tokenization/test_tokenizer_registry.py
+++ b/tests/tokenization/test_tokenizer_registry.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING, Any, Optional, Union
 
diff --git a/tests/tool_use/conftest.py b/tests/tool_use/conftest.py
index 4bf9b45fe212bb09a7bd6cf19b8484c0f61a5902..510b54790cd90378b4635c06e7ecbb5346152702 100644
--- a/tests/tool_use/conftest.py
+++ b/tests/tool_use/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import pytest_asyncio
diff --git a/tests/tool_use/test_chat_completion_request_validations.py b/tests/tool_use/test_chat_completion_request_validations.py
index ba0ad78f646752dc00718d4a789342911061ce60..a30c58b09fe8f61c1beb29b8d413a4f51a5537e1 100644
--- a/tests/tool_use/test_chat_completion_request_validations.py
+++ b/tests/tool_use/test_chat_completion_request_validations.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py
index 448347be6ec1df640b3f07a9d9693cd55a16842c..8c01c86e29f2f423ad5987214a1123ea2c611014 100644
--- a/tests/tool_use/test_chat_completions.py
+++ b/tests/tool_use/test_chat_completions.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai
 import pytest
diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py
index a40675744ba2439eef2e2b698e1d0dfe79dff914..35153139350bf1dd2769be25539bb4bc29879b3e 100644
--- a/tests/tool_use/test_jamba_tool_parser.py
+++ b/tests/tool_use/test_jamba_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Generator
diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py
index 910e0b2d51ab6f510bab9c2fca027b0e5f1dc27f..fff20c68d6212f95efe74803da76297b52131e1d 100644
--- a/tests/tool_use/test_parallel_tool_calls.py
+++ b/tests/tool_use/test_parallel_tool_calls.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from typing import Optional
diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py
index b320b335e338cc063dffd29d6ad90d89055c7901..53ba03a0ae1091d83c675a0c61371ebddfb92c31 100644
--- a/tests/tool_use/test_tool_calls.py
+++ b/tests/tool_use/test_tool_calls.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from typing import Optional
diff --git a/tests/tool_use/test_tool_choice_required.py b/tests/tool_use/test_tool_choice_required.py
index 29176984814535d9110c9e95ccadd984285117ad..3b43b723d43874ba7869ef08b2406950027dbabe 100644
--- a/tests/tool_use/test_tool_choice_required.py
+++ b/tests/tool_use/test_tool_choice_required.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 from copy import deepcopy
 from unittest.mock import MagicMock
diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py
index efa6455c41df717f72804700a474c93622add49d..a17fab9aecbcab5e37be5175912ae2675af60fee 100644
--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import deepcopy
 from typing import Any, Optional
diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py
index 21d7fce691c95eec1d4f784effae2613b0df2207..b26bdd34d890e22aa51b745ab9fd322f6f216eba 100644
--- a/tests/tpu/lora/test_lora.py
+++ b/tests/tpu/lora/test_lora.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 import vllm
diff --git a/tests/tpu/lora/test_pallas_kernels.py b/tests/tpu/lora/test_pallas_kernels.py
deleted file mode 100644
index 8bd47de50c340f4e6979078224090d350bec9d39..0000000000000000000000000000000000000000
--- a/tests/tpu/lora/test_pallas_kernels.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import torch
-
-# Required to register the custom ops
-import vllm.lora.ops.xla_ops.pallas  # noqa # pylint: disable=unused-import
-
-N_TOKENS = [16, 1024, 4096]
-HIDDEN_SIZES = [1024, 2048, 4096]
-
-DTYPES = [torch.bfloat16]
-NUM_LORA = [1, 4, 16]
-RANKS = [32, 256, 512]
-
-
-def generate_test_data(T, D, L, N, seed, dtype=torch.float32):
-    """
-    Inputs: (All integers)
-        T: Total number of tokens
-        D: Input dim
-        L: LoRA Dim
-        N: N LoRAs
-    
-    Outputs:
-        inputs:     torch.Tensor - shape (T, D)
-        loras:      torch.Tensor - shape (N, 1, L, D)
-        idxs:       torch.Tensor - shape (T, ) - all values must be in [0, N)
-        
-        ref_output: torch.Tensor - shape (T, L) - inputs @ loras[idxs].T
-    """
-    torch.manual_seed(seed)
-
-    inputs = torch.randn((T, D), device="xla", dtype=dtype)
-    loras = torch.randn((N, 1, L, D), device="xla", dtype=dtype)
-    idxs = torch.randint(0, N, (T, ), dtype=torch.int32, device="xla")
-
-    ref_output = ref_bgmv(inputs, loras, idxs)
-    return inputs, loras, idxs, ref_output
-
-
-def ref_bgmv(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.Tensor):
-    selected_loras = loras[idxs]
-    if len(selected_loras.shape) == 4:
-        selected_loras = selected_loras.squeeze(axis=1)
-
-    batch_size, output_size, input_size = selected_loras.shape
-    return (selected_loras @ inputs.reshape(
-        (batch_size, input_size, 1))).reshape((batch_size, output_size))
-
-
-# Parameterize tests with various shapes and dtypes
-@pytest.mark.parametrize("T", N_TOKENS)
-@pytest.mark.parametrize("D", HIDDEN_SIZES)
-@pytest.mark.parametrize("L", RANKS)
-@pytest.mark.parametrize("N", NUM_LORA)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["shrink", "expand"])
-@pytest.mark.parametrize("seed", [0])
-def test_bgmv_correctness(T, D, L, N, dtype, op_type, seed):
-    if op_type == "expand":
-        D, L = L, D
-
-    inputs, loras, idxs, ref_output = generate_test_data(
-        T, D, L, N, seed, dtype)
-
-    # Run bgmv
-    output = torch.ops.xla.bgmv(inputs, loras, idxs)
-
-    # Make sure we have no NaNs
-    assert not torch.any(torch.isnan(output))
-
-    # Compare with reference output
-    assert torch.allclose(output, ref_output, rtol=1e-2, atol=1e-2)
diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py
index 06e00187caf46288a2e661eeb687fabe1ce71435..448b8b2bc094f1f3edd065ed8403a6340f9c35d7 100644
--- a/tests/tpu/test_compilation.py
+++ b/tests/tpu/test_compilation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import glob
 import os
@@ -63,9 +64,10 @@ def test_tpu_compilation():
         numbers = [int(part) for part in parts if part.isdigit()]
         return numbers[0]
 
-    # Check all the compilations are as expected
+    # Check all the compilations are as expected. The dump files include the
+    # captured graph for the forward function of the nn.Module.
     compiled_fns = sorted(glob.glob(
-        os.path.join(temp_dir, "__compiled_fn*Captured*.py")),
+        os.path.join(temp_dir, "__compiled_fn*Forward_graph*.py")),
                           key=lambda s: extract_compiled_index(s))
 
     for i, compiled_fn in enumerate(compiled_fns):
diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py
index acb6b90f5f7f6419e65f1f5a2ad674a947b7caa1..9c90df1b77010dee144f334748ab8a8a7b8c41e4 100644
--- a/tests/tpu/test_custom_dispatcher.py
+++ b/tests/tpu/test_custom_dispatcher.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/tpu/test_moe_pallas.py b/tests/tpu/test_moe_pallas.py
index 13fc8bc8fa2ed0b634277a3feaa47a3f323215de..407a824d8174804f3fae2e56cce46b8e82dc53ce 100644
--- a/tests/tpu/test_moe_pallas.py
+++ b/tests/tpu/test_moe_pallas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the Pallas MOE implementation.
 
 Run `pytest tests/kernels/moe/test_moe_pallas.py`.
diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py
index 20f9dd77d0e8d55bf823a69d374c302d7eaf31d7..a13cf7064d54b966322b67bec243b8d2d57090dc 100644
--- a/tests/tpu/test_quantization_accuracy.py
+++ b/tests/tpu/test_quantization_accuracy.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 
diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index a781b8b563be1c8b05facb4986951b4112c41911..4dbae7c15de3a0291eca3971578f50bec1cd8c36 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa
 # type: ignore
 from __future__ import annotations
@@ -173,7 +174,7 @@ def test_traces_with_detailed_steps(
         llm = LLM(
             model=model,
             otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
-            collect_detailed_traces="all",
+            collect_detailed_traces=["all"],
         )
         prompts = ["This is a short prompt"]
         outputs = llm.generate(prompts, sampling_params=sampling_params)
diff --git a/tests/utils.py b/tests/utils.py
index bf38d7843853d8bef662185f8d5d54de03d3f4fc..ade28a481261ce40d7d164711f8b88b22939663b 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import copy
@@ -28,7 +29,7 @@ from tests.models.utils import TextTextLogprobs
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.entrypoints.openai.cli_args import make_arg_parser
+from vllm.entrypoints.cli.serve import ServeSubcommand
 from vllm.model_executor.model_loader import get_model_loader
 from vllm.platforms import current_platform
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -99,7 +100,8 @@ class RemoteOpenAIServer:
 
         parser = FlexibleArgumentParser(
             description="vLLM's remote OpenAI server.")
-        parser = make_arg_parser(parser)
+        subparsers = parser.add_subparsers(required=False, dest="subparser")
+        parser = ServeSubcommand().subparser_init(subparsers)
         args = parser.parse_args(["--model", model, *vllm_serve_args])
         self.host = str(args.host or 'localhost')
         self.port = int(args.port)
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 43a27da2dbe43436d3f6369a55d852d827c05f8e..ab7aa02823ab93305f76f65628fceccc42220894 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib
 
 import pytest
@@ -11,13 +12,11 @@ from vllm.utils import GiB_bytes, sha256
 from vllm.v1.core.kv_cache_manager import KVCacheManager
 # disable yapf here as it formats differently than isort such that both fail
 # yapf: disable
-from vllm.v1.core.kv_cache_utils import (FreeKVCacheBlockQueue, KVCacheBlock,
-                                         PrefixCachingMetrics,
-                                         estimate_max_model_len,
-                                         generate_block_hash_extra_keys,
-                                         hash_block_tokens,
-                                         hash_request_tokens,
-                                         unify_kv_cache_configs)
+from vllm.v1.core.kv_cache_utils import (
+    FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics,
+    estimate_max_model_len, generate_block_hash_extra_keys,
+    get_kv_cache_config, get_max_concurrency_for_kv_cache_config,
+    hash_block_tokens, hash_request_tokens, unify_kv_cache_configs)
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, KVCacheTensor,
                                         SlidingWindowSpec)
@@ -45,7 +44,6 @@ def make_request(request_id,
         multi_modal_placeholders=mm_positions,
         sampling_params=SamplingParams(max_tokens=17),
         eos_token_id=100,
-        arrival_time=0,
         lora_request=None,
         cache_salt=cache_salt,
     )
@@ -65,6 +63,20 @@ def new_kv_cache_spec(block_size=16,
                              sliding_window=sliding_window)
 
 
+def new_sliding_window_spec(block_size=16,
+                            num_kv_heads=2,
+                            head_size=64,
+                            dtype=torch.float32,
+                            use_mla=False,
+                            sliding_window=1):
+    return SlidingWindowSpec(block_size=block_size,
+                             num_kv_heads=num_kv_heads,
+                             head_size=head_size,
+                             dtype=dtype,
+                             use_mla=use_mla,
+                             sliding_window=sliding_window)
+
+
 def test_none_hash(monkeypatch):
     import vllm.v1.core.kv_cache_utils
 
@@ -101,8 +113,8 @@ def test_kv_cache_block():
     assert block.ref_cnt == 0
 
     # Test block hash setting and resetting
-    block_hash = vllm.v1.core.kv_cache_utils.BlockHashType(hash_value=123,
-                                                           token_ids=(1, 2, 3))
+    block_hash = vllm.v1.core.kv_cache_utils.BlockHash(hash_value=123,
+                                                       token_ids=(1, 2, 3))
     block.block_hash = block_hash
     assert block.block_hash == block_hash
 
@@ -283,7 +295,7 @@ def test_hash_block_tokens(hash_fn):
 
     block_hash = hash_block_tokens(hash_fn, parent_block_hash,
                                    curr_block_token_ids, extra_keys)
-    assert isinstance(block_hash, vllm.v1.core.kv_cache_utils.BlockHashType)
+    assert isinstance(block_hash, vllm.v1.core.kv_cache_utils.BlockHash)
     assert block_hash.hash_value == hash_fn(
         (parent_block_hash, curr_block_token_ids, extra_keys))
     assert block_hash.token_ids == curr_block_token_ids
@@ -307,10 +319,8 @@ def test_hash_request_tokens(hash_fn):
     block_hashes = hash_request_tokens(hash_fn, block_size, request)
 
     assert len(block_hashes) == 2
-    assert isinstance(block_hashes[0],
-                      vllm.v1.core.kv_cache_utils.BlockHashType)
-    assert isinstance(block_hashes[1],
-                      vllm.v1.core.kv_cache_utils.BlockHashType)
+    assert isinstance(block_hashes[0], vllm.v1.core.kv_cache_utils.BlockHash)
+    assert isinstance(block_hashes[1], vllm.v1.core.kv_cache_utils.BlockHash)
 
     # Check the first block
     assert block_hashes[0].token_ids == (0, 1, 2)
@@ -407,10 +417,10 @@ def test_unify_kv_cache_configs():
     same_kv_cache_config = [
         KVCacheConfig(
             num_blocks=10,
-            tensors={
-                "layer1": KVCacheTensor(100),
-                "layer2": KVCacheTensor(100),
-            },
+            kv_cache_tensors=[
+                KVCacheTensor(size=100, shared_by=["layer1"]),
+                KVCacheTensor(size=100, shared_by=["layer2"]),
+            ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
                 KVCacheGroupSpec(["layer2"],
@@ -419,10 +429,10 @@ def test_unify_kv_cache_configs():
         ),
         KVCacheConfig(
             num_blocks=20,
-            tensors={
-                "layer1": KVCacheTensor(100),
-                "layer2": KVCacheTensor(100),
-            },
+            kv_cache_tensors=[
+                KVCacheTensor(size=100, shared_by=["layer1"]),
+                KVCacheTensor(size=100, shared_by=["layer2"]),
+            ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
                 KVCacheGroupSpec(["layer2"],
@@ -437,10 +447,10 @@ def test_unify_kv_cache_configs():
     need_sort_kv_cache_config = [
         KVCacheConfig(
             num_blocks=10,
-            tensors={
-                "layer1": KVCacheTensor(100),
-                "layer2": KVCacheTensor(100),
-            },
+            kv_cache_tensors=[
+                KVCacheTensor(size=100, shared_by=["layer1"]),
+                KVCacheTensor(size=100, shared_by=["layer2"]),
+            ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
                 KVCacheGroupSpec(["layer2"],
@@ -449,10 +459,10 @@ def test_unify_kv_cache_configs():
         ),
         KVCacheConfig(
             num_blocks=20,
-            tensors={
-                "layer1": KVCacheTensor(100),
-                "layer2": KVCacheTensor(100),
-            },
+            kv_cache_tensors=[
+                KVCacheTensor(size=100, shared_by=["layer1"]),
+                KVCacheTensor(size=100, shared_by=["layer2"]),
+            ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer2"],
                                  new_kv_cache_spec(num_kv_heads=4)),
@@ -468,10 +478,10 @@ def test_unify_kv_cache_configs():
     diff_kv_cache_config = [
         KVCacheConfig(
             num_blocks=10,
-            tensors={
-                "layer1": KVCacheTensor(100),
-                "layer2": KVCacheTensor(100),
-            },
+            kv_cache_tensors=[
+                KVCacheTensor(size=100, shared_by=["layer1"]),
+                KVCacheTensor(size=100, shared_by=["layer2"]),
+            ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
                 KVCacheGroupSpec(["layer2"],
@@ -480,10 +490,10 @@ def test_unify_kv_cache_configs():
         ),
         KVCacheConfig(
             num_blocks=20,
-            tensors={
-                "layer1": KVCacheTensor(100),
-                "layer2": KVCacheTensor(100),
-            },
+            kv_cache_tensors=[
+                KVCacheTensor(size=100, shared_by=["layer1"]),
+                KVCacheTensor(size=100, shared_by=["layer2"]),
+            ],
             kv_cache_groups=[
                 KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
                 KVCacheGroupSpec(["layer2"],
@@ -599,14 +609,92 @@ def test_estimate_max_model_len(model_id, max_model_len,
     assert estimated_max_len == want_estimated_max_len
 
 
+def test_get_max_concurrency_for_kv_cache_config():
+    # Create a VllmConfig
+    model_id = "Qwen/Qwen1.5-7B"
+    max_model_len = 16384
+    model_config = ModelConfig(
+        model_id,
+        task="generate",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="float16",
+        max_model_len=max_model_len,
+    )
+    scheduler_config = SchedulerConfig(max_num_batched_tokens=1024,
+                                       enable_chunked_prefill=True)
+
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        scheduler_config=scheduler_config,
+    )
+
+    full_attention_spec = FullAttentionSpec(
+        block_size=16,
+        num_kv_heads=32,
+        head_size=128,
+        dtype=torch.float16,
+        use_mla=False,
+    )
+
+    sliding_window_spec = SlidingWindowSpec(
+        block_size=16,
+        num_kv_heads=32,
+        head_size=128,
+        dtype=torch.float16,
+        use_mla=False,
+        sliding_window=1024,
+    )
+
+    kv_cache_config_full_attention = KVCacheConfig(
+        num_blocks=int(1024 * 1.5),
+        kv_cache_tensors=[],
+        kv_cache_groups=[
+            KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
+                             full_attention_spec),
+        ],
+    )
+    max_concurrency_full_attention = get_max_concurrency_for_kv_cache_config(
+        vllm_config, kv_cache_config_full_attention)
+    assert max_concurrency_full_attention == 1.5
+
+    kv_cache_config_sliding_window = KVCacheConfig(
+        num_blocks=129 * 3,
+        kv_cache_tensors=[],
+        kv_cache_groups=[
+            KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
+                             sliding_window_spec),
+        ],
+    )
+    max_concurrency_sliding_window = get_max_concurrency_for_kv_cache_config(
+        vllm_config, kv_cache_config_sliding_window)
+    assert max_concurrency_sliding_window == 3
+
+    kv_cache_config_hybrid_model = KVCacheConfig(
+        num_blocks=(1024 + 129) * 3,
+        kv_cache_tensors=[],
+        kv_cache_groups=[
+            KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
+                             full_attention_spec),
+            KVCacheGroupSpec([f"layer_{i}" for i in range(32, 64)],
+                             sliding_window_spec),
+        ],
+    )
+    max_concurrency_hybrid_model = get_max_concurrency_for_kv_cache_config(
+        vllm_config, kv_cache_config_hybrid_model)
+    assert max_concurrency_hybrid_model == 3
+
+
 def test_allocate_with_lookahead():
     """Verify that lookahead tokens correctly affect block allocation"""
     block_size = 4
     config = KVCacheConfig(
         num_blocks=10,
-        tensors={
-            "layer1": KVCacheTensor(100),
-        },
+        kv_cache_tensors=[
+            KVCacheTensor(size=100, shared_by=["layer1"]),
+        ],
         kv_cache_groups=[
             KVCacheGroupSpec(["layer1"],
                              new_kv_cache_spec(block_size=block_size)),
@@ -628,7 +716,7 @@ def test_allocate_with_lookahead():
         num_new_tokens=3,
         num_lookahead_tokens=2,  # Total required: 3+2=5 tokens
     )
-    assert len(blocks.blocks) == 2  # ceil(5/4)=2 blocks
+    assert len(blocks.get_block_ids()[0]) == 2  # ceil(5/4)=2 blocks
 
     # Test case 2: With precomputed blocks
     kv_cache_manager = KVCacheManager(kv_cache_config=config,
@@ -639,7 +727,7 @@ def test_allocate_with_lookahead():
         num_new_tokens=3,
         num_lookahead_tokens=2,
     )
-    assert len(blocks.blocks) == 2
+    assert len(blocks.get_block_ids()[0]) == 2
 
     # Test case 3: With precomputed blocks
     # required_blocks = ceil((3 + 4) / 4) = 2
@@ -650,4 +738,165 @@ def test_allocate_with_lookahead():
         num_new_tokens=3,
         num_lookahead_tokens=4,
     )
-    assert len(blocks.blocks) == 2
+    assert len(blocks.get_block_ids()[0]) == 2
+
+
+def test_get_kv_cache_config():
+    # pass max_model_len to pass check_enough_kv_cache_memory
+    model_config = ModelConfig(max_model_len=16)
+    vllm_config = VllmConfig(model_config=model_config)
+
+    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
+    # all layers are full attention -> single group
+    kv_cache_specs_full = {
+        'layer_1': new_kv_cache_spec(),
+        'layer_2': new_kv_cache_spec(),
+    }
+    kv_cache_config_full = get_kv_cache_config(
+        vllm_config, kv_cache_specs_full, mem_per_block_per_layer * 2 * 32)
+    assert kv_cache_config_full == KVCacheConfig(
+        num_blocks=32,
+        kv_cache_tensors=[
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_1"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_2"]),
+        ],
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec())
+        ])
+
+    # all layers are sliding window -> single group
+    kv_cache_specs_sliding = {
+        'layer_1': new_sliding_window_spec(),
+        'layer_2': new_sliding_window_spec(),
+    }
+    kv_cache_config_sliding = get_kv_cache_config(
+        vllm_config, kv_cache_specs_sliding, mem_per_block_per_layer * 2 * 32)
+    assert kv_cache_config_sliding == KVCacheConfig(
+        num_blocks=32,
+        kv_cache_tensors=[
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_1"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_2"]),
+        ],
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer_1", "layer_2"], new_sliding_window_spec())
+        ])
+
+    # full + sliding, but disable_hybrid_kv_cache_manager
+    vllm_config.scheduler_config.disable_hybrid_kv_cache_manager = True
+    kv_cache_specs_hybrid = {
+        'layer_1': new_kv_cache_spec(),
+        'layer_2': new_sliding_window_spec(),
+    }
+    kv_cache_config_hybrid = get_kv_cache_config(
+        vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32)
+    assert kv_cache_config_hybrid == KVCacheConfig(
+        num_blocks=32,
+        kv_cache_tensors=[
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_1"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_2"]),
+        ],
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer_1", "layer_2"],
+                             new_kv_cache_spec(sliding_window=1)),
+        ],
+    )
+    vllm_config.scheduler_config.disable_hybrid_kv_cache_manager = False
+
+    # full + sliding, with hybrid_kv_cache_manager
+    kv_cache_specs_hybrid = {
+        'layer_1': new_kv_cache_spec(),
+        'layer_2': new_sliding_window_spec(),
+    }
+    kv_cache_config_hybrid = get_kv_cache_config(
+        vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32)
+    assert kv_cache_config_hybrid == KVCacheConfig(
+        num_blocks=64,
+        kv_cache_tensors=[
+            KVCacheTensor(size=mem_per_block_per_layer * 64,
+                          shared_by=["layer_1", "layer_2"]),
+        ],
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer_1"], new_kv_cache_spec()),
+            KVCacheGroupSpec(["layer_2"], new_sliding_window_spec()),
+        ],
+    )
+
+    # 2 full + 4 sliding, 2 layers per group
+    kv_cache_specs_hybrid = {
+        'layer_1': new_kv_cache_spec(),
+        'layer_2': new_kv_cache_spec(),
+        'layer_3': new_sliding_window_spec(),
+        'layer_4': new_sliding_window_spec(),
+        'layer_5': new_sliding_window_spec(),
+        'layer_6': new_sliding_window_spec(),
+    }
+    kv_cache_config_hybrid = get_kv_cache_config(
+        vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32)
+    assert kv_cache_config_hybrid == KVCacheConfig(
+        num_blocks=32,
+        kv_cache_tensors=[
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_1", "layer_3", "layer_5"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_2", "layer_4", "layer_6"]),
+        ],
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec()),
+            KVCacheGroupSpec(["layer_3", "layer_4"],
+                             new_sliding_window_spec()),
+            KVCacheGroupSpec(["layer_5", "layer_6"],
+                             new_sliding_window_spec()),
+        ],
+    )
+
+    # 3 full + 7 sliding, pad to 3 full + 9 sliding
+    kv_cache_specs_hybrid = {
+        'layer_1': new_kv_cache_spec(),
+        'layer_2': new_kv_cache_spec(),
+        'layer_3': new_kv_cache_spec(),
+        'layer_4': new_sliding_window_spec(),
+        'layer_5': new_sliding_window_spec(),
+        'layer_6': new_sliding_window_spec(),
+        'layer_7': new_sliding_window_spec(),
+        'layer_8': new_sliding_window_spec(),
+        'layer_9': new_sliding_window_spec(),
+        'layer_10': new_sliding_window_spec(),
+    }
+    kv_cache_config_hybrid = get_kv_cache_config(
+        vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 3 * 32)
+    assert kv_cache_config_hybrid == KVCacheConfig(
+        num_blocks=32,
+        kv_cache_tensors=[
+            KVCacheTensor(
+                size=mem_per_block_per_layer * 32,
+                shared_by=["layer_1", "layer_4", "layer_7", "layer_10"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_2", "layer_5", "layer_8"]),
+            KVCacheTensor(size=mem_per_block_per_layer * 32,
+                          shared_by=["layer_3", "layer_6", "layer_9"]),
+        ],
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer_1", "layer_2", "layer_3"],
+                             new_kv_cache_spec()),
+            KVCacheGroupSpec(["layer_4", "layer_5", "layer_6"],
+                             new_sliding_window_spec()),
+            KVCacheGroupSpec(["layer_7", "layer_8", "layer_9"],
+                             new_sliding_window_spec()),
+            KVCacheGroupSpec(["layer_10"], new_sliding_window_spec()),
+        ],
+    )
+
+    # different hidden size, unimplemented
+    kv_cache_specs_hybrid = {
+        'layer_1': new_kv_cache_spec(head_size=128),
+        'layer_2': new_kv_cache_spec(),
+    }
+    with pytest.raises(NotImplementedError):
+        get_kv_cache_config(vllm_config, kv_cache_specs_hybrid,
+                            mem_per_block_per_layer * 2 * 32)
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 3da27786b1f2f9bd3e9e96f945a43cc1b8e73f32..394336624aca85de95ba550582519b0b67544b04 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the with and without prefix caching."""
 
+import copy
 from typing import Optional
 
 import pytest
@@ -12,8 +14,8 @@ from vllm.sampling_params import SamplingParams
 from vllm.utils import sha256
 from vllm.v1.core.block_pool import BlockPool
 from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
-from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock,
-                                         hash_block_tokens)
+from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
+                                         KVCacheBlock, hash_block_tokens)
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, SlidingWindowSpec)
 
@@ -38,7 +40,6 @@ def make_request(request_id,
         sampling_params=SamplingParams(max_tokens=17,
                                        prompt_logprobs=prompt_logprobs),
         eos_token_id=100,
-        arrival_time=0,
         lora_request=None,
         cache_salt=cache_salt,
     )
@@ -47,7 +48,7 @@ def make_request(request_id,
 def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
     return KVCacheConfig(
         num_blocks=num_blocks,
-        tensors={},
+        kv_cache_tensors=[],
         kv_cache_groups=[
             KVCacheGroupSpec(
                 ["layer"],
@@ -57,6 +58,38 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
     )
 
 
+def make_kv_cache_config_hybrid_model(block_size: int,
+                                      num_blocks: int) -> KVCacheConfig:
+    return KVCacheConfig(
+        num_blocks=num_blocks,
+        kv_cache_tensors=[],
+        kv_cache_groups=[
+            KVCacheGroupSpec(
+                ["layer1"],
+                FullAttentionSpec(block_size, 1, 1, torch.float32, False),
+            ),
+            KVCacheGroupSpec(
+                ["layer2"],
+                SlidingWindowSpec(block_size,
+                                  1,
+                                  1,
+                                  torch.float32,
+                                  False,
+                                  sliding_window=2 * block_size),
+            ),
+            KVCacheGroupSpec(
+                ["layer3"],
+                SlidingWindowSpec(block_size,
+                                  1,
+                                  1,
+                                  torch.float32,
+                                  False,
+                                  sliding_window=2 * block_size),
+            ),
+        ],
+    )
+
+
 @pytest.mark.parametrize("hash_algo", ["sha256", "hash"])
 def test_prefill(hash_algo):
     manager = KVCacheManager(
@@ -79,12 +112,12 @@ def test_prefill(hash_algo):
     req0 = make_request("0", all_token_ids)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert len(manager.req_to_block_hashes[req0.request_id]) == 3
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, 55,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
 
     # Check full block metadata
     parent_block_hash = None
@@ -92,7 +125,8 @@ def test_prefill(hash_algo):
         block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
         block_hash = hash_block_tokens(hash_fn, parent_block_hash,
                                        block_tokens)
-        assert manager.block_pool.blocks[block_id].block_hash == block_hash
+        assert manager.block_pool.blocks[
+            block_id].block_hash.block_hash == block_hash
         assert manager.block_pool.blocks[block_id].ref_cnt == 1
         parent_block_hash = block_hash.hash_value
 
@@ -107,14 +141,14 @@ def test_prefill(hash_algo):
     req1 = make_request("1", common_token_ids + unique_token_ids)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert len(manager.req_to_block_hashes[req1.request_id]) == 3
-    assert computed_blocks.get_block_ids() == [[1, 2, 3]]
+    assert computed_blocks.get_block_ids() == ([1, 2, 3], )
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req1, num_new_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[5]]
-    for block in computed_blocks.blocks:
+    assert blocks.get_block_ids() == ([5], )
+    for block in computed_blocks.blocks[0]:
         assert block.ref_cnt == 2
 
     # At this point, we should have 5 free blocks left.
@@ -141,13 +175,13 @@ def test_prefill(hash_algo):
     req2 = make_request("2", common_token_ids + unique_token_ids)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert len(manager.req_to_block_hashes[req2.request_id]) == 3
-    assert computed_blocks.get_block_ids() == [[1, 2, 3]]
+    assert computed_blocks.get_block_ids() == ([1, 2, 3], )
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req2, num_new_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[6]]
+    assert blocks.get_block_ids() == ([6], )
 
     # Although we only have 6 free blocks, we have 8 blocks in
     # the free block queue due to lazy removal.
@@ -165,18 +199,150 @@ def test_prefill(hash_algo):
     # Cache miss and eviction.
     req3 = make_request("3", [99] * (16 * 10))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req3, 16 * 10,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     # This block ID order also checks the eviction order.
-    assert blocks.get_block_ids() == [[7, 8, 9, 10, 4, 5, 6, 3, 2, 1]]
+    assert blocks.get_block_ids() == ([7, 8, 9, 10, 4, 5, 6, 3, 2, 1], )
     assert manager.block_pool.free_block_queue.num_free_blocks == 0
     assert manager.block_pool.free_block_queue.free_list_head is None
     assert manager.block_pool.free_block_queue.free_list_tail is None
 
 
+def test_prefill_hybrid_model():
+    block_size = 16
+    manager = KVCacheManager(
+        make_kv_cache_config_hybrid_model(block_size, 21),
+        max_model_len=8192,
+        enable_caching=True,
+    )
+
+    hash_fn = hash
+
+    # Complete 3 blocks (48 tokens)
+    common_token_ids = [i for i in range(3) for _ in range(block_size)]
+
+    # Fully cache miss
+    # Incomplete 1 block (7 tokens)
+    unique_token_ids = [3] * 7
+    all_token_ids = common_token_ids + unique_token_ids
+    req0 = make_request("0", all_token_ids)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
+    assert len(manager.req_to_block_hashes[req0.request_id]) == 3
+    assert not computed_blocks.blocks[0]
+    assert num_computed_tokens == 0
+    blocks = manager.allocate_slots(req0, 55,
+                                    len(computed_blocks.blocks[0]) * 16,
+                                    computed_blocks)
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], [5, 6, 7,
+                                                     8], [9, 10, 11, 12])
+
+    # Check full block metadata
+    parent_block_hash = None
+    for length, block_ids in zip((1, 2, 3),
+                                 ((1, 5, 9), (2, 6, 10), (3, 7, 11))):
+        block_tokens = tuple(all_token_ids[(length - 1) * 16:length * 16])
+        block_hash = hash_block_tokens(hash_fn, parent_block_hash,
+                                       block_tokens)
+        for block_id in block_ids:
+            assert manager.block_pool.blocks[
+                block_id].block_hash.block_hash == block_hash
+            assert manager.block_pool.blocks[block_id].ref_cnt == 1
+        parent_block_hash = block_hash.hash_value
+
+    # Check partial block metadata
+    for block_id in (4, 8, 12):
+        assert manager.block_pool.blocks[block_id].block_hash is None
+        assert manager.block_pool.blocks[block_id].ref_cnt == 1
+
+    # Cache hit in the common prefix
+    # Incomplete 1 block (5 tokens)
+    unique_token_ids = [3] * 5
+    req1 = make_request("1", common_token_ids + unique_token_ids)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
+    assert len(manager.req_to_block_hashes[req1.request_id]) == 3
+    assert computed_blocks.get_block_ids() == ([1, 2, 3], [0, 6,
+                                                           7], [0, 10, 11])
+    assert num_computed_tokens == 3 * 16
+    num_new_tokens = 53 - 3 * 16
+    blocks = manager.allocate_slots(req1, num_new_tokens,
+                                    len(computed_blocks.blocks[0]) * 16,
+                                    computed_blocks)
+    assert blocks.get_block_ids() == ([13], [14], [15])
+    for block_per_group in computed_blocks.blocks:
+        for block in block_per_group:
+            if block != manager.block_pool.null_block:
+                assert block.ref_cnt == 2
+
+    block_hashes = manager.req_to_block_hashes[req1.request_id]
+    manager.free(req0)
+    manager.free(req1)
+
+    cached_block_hash_to_block_bak = copy.copy(
+        manager.block_pool.cached_block_hash_to_block)
+
+    def test_partial_request_hit(request_id: str,
+                                 hash_to_evict: list[BlockHashWithGroupId],
+                                 expect_hit_length: int):
+        req = make_request(request_id, common_token_ids + unique_token_ids)
+        for hash_with_group_id in hash_to_evict:
+            manager.block_pool.cached_block_hash_to_block.pop(
+                hash_with_group_id)
+        computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
+        assert len(manager.req_to_block_hashes[req.request_id]) == 3
+        assert num_computed_tokens == expect_hit_length * block_size
+        for block_per_group in computed_blocks.blocks:
+            assert len(block_per_group) == num_computed_tokens // block_size
+        for hash_with_group_id in hash_to_evict:
+            manager.block_pool.cached_block_hash_to_block[
+                hash_with_group_id] = cached_block_hash_to_block_bak[
+                    hash_with_group_id]
+        manager.free(req)
+
+    # Evict the blocks outside sliding window, does not affect the hit length.
+    test_partial_request_hit("2", [
+        BlockHashWithGroupId(block_hashes[0], 1),
+        BlockHashWithGroupId(block_hashes[0], 2)
+    ], 3)
+
+    # Evict the first block of full attention, makes total cache miss.
+    test_partial_request_hit("3", [
+        BlockHashWithGroupId(block_hashes[0], 0),
+    ], 0)
+
+    # Evict the last block of all layers, reduces the hit length to 2.
+    test_partial_request_hit("4", [
+        BlockHashWithGroupId(block_hashes[2], 0),
+        BlockHashWithGroupId(block_hashes[2], 1),
+        BlockHashWithGroupId(block_hashes[2], 2),
+    ], 2)
+
+    # Evict the last block of full attention, reduces the hit length to 2.
+    test_partial_request_hit("5", [BlockHashWithGroupId(block_hashes[2], 0)],
+                             2)
+
+    # Evict the last block of sliding window, reduces the hit length to 2.
+    test_partial_request_hit("6", [BlockHashWithGroupId(block_hashes[2], 1)],
+                             2)
+
+    # Evict the last block of sliding window, reduces the hit length to 2.
+    test_partial_request_hit("7", [BlockHashWithGroupId(block_hashes[2], 2)],
+                             2)
+
+    # Evict different set of blocks for full attention and sliding window makes
+    # total cache miss.
+    # The cache hit length of full attention is 1 * block_size.
+    # The cache hit length of sliding window is 2 * block_size.
+    # Then it is cache miss as the two type of layers have different hit length.
+    test_partial_request_hit("8", [
+        BlockHashWithGroupId(block_hashes[2], 0),
+        BlockHashWithGroupId(block_hashes[0], 1),
+        BlockHashWithGroupId(block_hashes[0], 2),
+    ], 0)
+
+
 def test_prefill_plp():
     '''Test prefill with APC and some prompt logprobs (plp) requests.
 
@@ -203,13 +369,13 @@ def test_prefill_plp():
     req0 = make_request("0", all_token_ids, prompt_logprobs=5)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert len(manager.req_to_block_hashes[req0.request_id]) == 0
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, 55,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
-    req0_block_hashes = [b.block_hash for b in blocks.blocks]
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
+    req0_block_hashes = [b.block_hash for b in blocks.blocks[0]]
 
     # Check full block metadata
     parent_block_hash = None
@@ -217,7 +383,8 @@ def test_prefill_plp():
         block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
         block_hash = hash_block_tokens(hash_fn, parent_block_hash,
                                        block_tokens)
-        assert manager.block_pool.blocks[block_id].block_hash == block_hash
+        assert manager.block_pool.blocks[
+            block_id].block_hash.block_hash == block_hash
         assert manager.block_pool.blocks[block_id].ref_cnt == 1
         parent_block_hash = block_hash.hash_value
 
@@ -233,14 +400,14 @@ def test_prefill_plp():
     req1 = make_request("1", common_token_ids + unique_token_ids)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert len(manager.req_to_block_hashes[req1.request_id]) == 3
-    assert computed_blocks.get_block_ids() == [[1, 2, 3]]
+    assert computed_blocks.get_block_ids() == ([1, 2, 3], )
     assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req1, num_new_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[5]]
-    for block in computed_blocks.blocks:
+    assert blocks.get_block_ids() == ([5], )
+    for block in computed_blocks.blocks[0]:
         assert block.ref_cnt == 2
 
     # At this point, we should have 5 free blocks left.
@@ -269,15 +436,15 @@ def test_prefill_plp():
                         prompt_logprobs=5)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert len(manager.req_to_block_hashes[req2.request_id]) == 0
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req2, 55,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     block_ids = blocks.get_block_ids()
     # Duplicate cached blocks have different ids but same hashes vs request #0
-    assert [b.block_hash for b in blocks.blocks] == req0_block_hashes
-    assert block_ids != [[1, 2, 3, 4]]
+    assert [b.block_hash for b in blocks.blocks[0]] == req0_block_hashes
+    assert block_ids != ([1, 2, 3, 4], )
 
     # Request #2 block hashes are valid since request #0 hashes are.
     # Check block reference counts.
@@ -302,22 +469,22 @@ def test_decode():
     unique_token_ids = [3] * 7
     req0 = make_request("0", common_token_ids + unique_token_ids)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, 55,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
 
     # Append slots without allocating a new block.
     req0.num_computed_tokens = 55
     for _ in range(4):
         req0.append_output_token_ids(8)
     new_blocks = manager.allocate_slots(req0, 4,
-                                        len(computed_blocks.blocks) * 16,
+                                        len(computed_blocks.blocks[0]) * 16,
                                         computed_blocks)
-    assert new_blocks is not None and len(new_blocks.blocks) == 0
-    assert manager.single_type_manager.req_to_blocks[
+    assert new_blocks is not None and len(new_blocks.blocks[0]) == 0
+    assert manager.coordinator.single_type_managers[0].req_to_blocks[
         req0.request_id][-1].block_hash is None
 
     # Append slots with allocating a new block.
@@ -327,12 +494,12 @@ def test_decode():
     for _ in range(9 + 10):
         req0.append_output_token_ids(7)
     new_blocks = manager.allocate_slots(req0, 19,
-                                        len(computed_blocks.blocks) * 16,
+                                        len(computed_blocks.blocks[0]) * 16,
                                         computed_blocks)
-    assert new_blocks is not None and len(new_blocks.blocks) == 1
-    assert manager.single_type_manager.req_to_blocks[
+    assert new_blocks is not None and len(new_blocks.blocks[0]) == 1
+    assert manager.coordinator.single_type_managers[0].req_to_blocks[
         req0.request_id][-2].block_hash is not None
-    assert manager.single_type_manager.req_to_blocks[
+    assert manager.coordinator.single_type_managers[0].req_to_blocks[
         req0.request_id][-1].block_hash is None
 
 
@@ -346,23 +513,23 @@ def test_evict():
     last_token_id = 5 * 16 + 7
     req0 = make_request("0", list(range(last_token_id)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, 5 * 16 + 7,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 6  # 5 full + 1 partial
+    assert len(blocks.blocks[0]) == 6  # 5 full + 1 partial
 
     # 3 blocks.
     req1 = make_request("1", list(range(last_token_id,
                                         last_token_id + 3 * 16)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req1, 3 * 16,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 3  # 3 full blocks
+    assert len(blocks.blocks[0]) == 3  # 3 full blocks
     last_token_id += 3 * 16
 
     # 10 - (6 + 3) == 1
@@ -379,12 +546,12 @@ def test_evict():
     # Touch the first 2 blocks.
     req2 = make_request("2", list(range(2 * 16 + 3)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
-    assert computed_blocks.get_block_ids() == [[1, 2]]
+    assert computed_blocks.get_block_ids() == ([1, 2], )
     assert num_computed_tokens == 2 * 16
     blocks = manager.allocate_slots(req2, 3,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[10]]
+    assert blocks.get_block_ids() == ([10], )
     assert manager.block_pool.free_block_queue.num_free_blocks == 7
 
 
@@ -404,12 +571,12 @@ def test_hash_block_correct_reuse():
     num_tokens = block_size * 1
     req = make_request("0", list(range(num_tokens)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req, num_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 1
+    assert len(blocks.blocks[0]) == 1
 
     # Deallocate the block.
     manager.free(req)
@@ -418,15 +585,15 @@ def test_hash_block_correct_reuse():
     # block is cleared.
     req = make_request("1", list(range(num_tokens - 1)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req, num_tokens - 1,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 1
+    assert len(blocks.blocks[0]) == 1
 
-    assert manager.block_pool.blocks[
-        blocks.blocks[0].block_id].block_hash is None
+    assert manager.block_pool.blocks[blocks.blocks[0]
+                                     [0].block_id].block_hash is None
 
 
 def test_computed_blocks_not_evicted():
@@ -445,24 +612,24 @@ def test_computed_blocks_not_evicted():
     num_tokens = block_size * 1
     req0 = make_request("0", list(range(num_tokens)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, num_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 1
-    assert blocks.blocks[0].block_id == 1
+    assert len(blocks.blocks[0]) == 1
+    assert blocks.blocks[0][0].block_id == 1
 
     # Allocate another block.
     req1 = make_request("1", list(range(num_tokens, num_tokens * 2)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req1, num_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 1
-    assert blocks.blocks[0].block_id == 2
+    assert len(blocks.blocks[0]) == 1
+    assert blocks.blocks[0][0].block_id == 2
 
     # Free the blocks.
     manager.free(req0)
@@ -472,15 +639,15 @@ def test_computed_blocks_not_evicted():
     # cached block rather than the first one.
     req2 = make_request("2", list(range(num_tokens * 2)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
-    assert len(computed_blocks.blocks) == 1
-    assert computed_blocks.blocks[0].block_id == 1
+    assert len(computed_blocks.blocks[0]) == 1
+    assert computed_blocks.blocks[0][0].block_id == 1
     assert num_computed_tokens == block_size
 
     blocks = manager.allocate_slots(req2, num_tokens * 2 - num_tokens,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 1
-    assert blocks.blocks[0].block_id == 2
+    assert len(blocks.blocks[0]) == 1
+    assert blocks.blocks[0][0].block_id == 2
 
 
 def test_basic_prefix_caching_disabled():
@@ -497,12 +664,12 @@ def test_basic_prefix_caching_disabled():
     req1 = make_request("1", list(range(10)))  # 2 blocks and some more
 
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req1, 10,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 3
+    assert len(blocks.blocks[0]) == 3
 
     # Free the blocks.
     manager.free(req1)
@@ -510,20 +677,20 @@ def test_basic_prefix_caching_disabled():
     # No caching.
     req2 = make_request("2", list(range(16)))  # shared prefix
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req2, 16,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert len(blocks.blocks) == 4
+    assert len(blocks.blocks[0]) == 4
 
     # New requests should not have any blocks.
     req3 = make_request("3", list(range(4)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req3, 4,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
     assert not blocks
 
@@ -548,7 +715,7 @@ def test_cache_blocks(hash_fn):
 
     # Test that blocks are cached correctly for 2 full blocks from the start.
     blocks = [KVCacheBlock(block_id=i) for i in range(2)]
-    block_hashes: list[BlockHashType] = []
+    block_hashes: list[BlockHash] = []
 
     block_pool.cache_full_blocks(
         request=req,
@@ -558,6 +725,7 @@ def test_cache_blocks(hash_fn):
         num_full_blocks=2,
         block_size=block_size,
         hash_fn=hash_fn,
+        kv_cache_group_id=0,
     )
 
     assert len(block_pool.cached_block_hash_to_block) == 2
@@ -573,11 +741,83 @@ def test_cache_blocks(hash_fn):
         num_full_blocks=3,
         block_size=block_size,
         hash_fn=hash_fn,
+        kv_cache_group_id=0,
     )
     assert len(block_pool.cached_block_hash_to_block) == 3
     assert blocks[0].block_hash is not None
 
 
+def test_cache_blocks_multi_group():
+    """
+    This tests that blocks are cached correctly for different kv cache groups.
+    """
+    block_size = 4
+    block_pool = BlockPool(num_gpu_blocks=10, enable_caching=True)
+
+    # Req:
+    #  Block 0/4: [0, 1, 2, 3]
+    #  Block 1/5: [4, 5, 6, 7]
+    #  Block 2/6: [8, 9, 10, 11]
+    #  Block 3/7: [12, 13]
+    req = make_request("0", list(range(14)))
+
+    # Cache the blocks for group 0.
+    blocks = [KVCacheBlock(block_id=i) for i in range(2)]
+    block_hashes: list[BlockHash] = []
+    block_pool.cache_full_blocks(
+        request=req,
+        blocks=blocks,
+        block_hashes=block_hashes,
+        num_cached_blocks=0,
+        num_full_blocks=2,
+        block_size=block_size,
+        hash_fn=hash,
+        kv_cache_group_id=0,
+    )
+    assert len(block_pool.cached_block_hash_to_block) == 2
+    assert len(block_hashes) == 2
+    assert all([block.block_hash is not None for block in blocks])
+
+    # Cache the blocks for group 1.
+    blocks = [KVCacheBlock(block_id=i) for i in range(3)]
+    block_pool.cache_full_blocks(
+        request=req,
+        blocks=blocks,
+        block_hashes=block_hashes,
+        num_cached_blocks=0,
+        num_full_blocks=3,
+        block_size=block_size,
+        hash_fn=hash,
+        kv_cache_group_id=1,
+    )
+    assert len(block_pool.cached_block_hash_to_block) == 5
+    assert len(block_hashes) == 3
+    assert all([block.block_hash is not None for block in blocks])
+
+    # Block hash 0: hit for group 0 and 1
+    # Block hash 1: hit for group 0 and 1
+    # Block hash 2: hit for group 1
+
+    assert block_pool.get_cached_block(block_hashes[0],
+                                       kv_cache_group_ids=[0]) is not None
+    assert block_pool.get_cached_block(block_hashes[1],
+                                       kv_cache_group_ids=[0]) is not None
+    assert block_pool.get_cached_block(block_hashes[2],
+                                       kv_cache_group_ids=[0]) is None
+    assert block_pool.get_cached_block(block_hashes[0],
+                                       kv_cache_group_ids=[1]) is not None
+    assert block_pool.get_cached_block(block_hashes[1],
+                                       kv_cache_group_ids=[1]) is not None
+    assert block_pool.get_cached_block(block_hashes[2],
+                                       kv_cache_group_ids=[1]) is not None
+    assert block_pool.get_cached_block(block_hashes[0],
+                                       kv_cache_group_ids=[0, 1]) is not None
+    assert block_pool.get_cached_block(block_hashes[1],
+                                       kv_cache_group_ids=[0, 1]) is not None
+    assert block_pool.get_cached_block(block_hashes[2],
+                                       kv_cache_group_ids=[0, 1]) is None
+
+
 def test_mm_prefix_caching():
     """
     This tests that the multi-modal prefix caching is correct.
@@ -614,7 +854,7 @@ def test_mm_prefix_caching():
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
 
     # Completed block should have hashes with extra keys.
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     block_hashes = manager.req_to_block_hashes[req0.request_id]
     assert len(block_hashes) == 3
@@ -623,18 +863,18 @@ def test_mm_prefix_caching():
     assert block_hashes[2].extra_keys == ("bbb", )
 
     blocks = manager.allocate_slots(req0, 59,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
     req0.num_computed_tokens = 59
 
     # Append slots without allocating a new block.
     for _ in range(5):
         req0.append_output_token_ids(8)
     new_blocks = manager.allocate_slots(req0, 5,
-                                        len(computed_blocks.blocks) * 16,
+                                        len(computed_blocks.blocks[0]) * 16,
                                         computed_blocks)
-    assert new_blocks is not None and len(new_blocks.blocks) == 0
+    assert new_blocks is not None and len(new_blocks.blocks[0]) == 0
 
     # The just completed block should have hashes with extra keys.
     assert len(block_hashes) == 4
@@ -652,7 +892,7 @@ def test_mm_prefix_caching():
                         mm_positions=mm_positions,
                         mm_hashes=mm_hashes)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
-    assert len(computed_blocks.blocks) == 3
+    assert len(computed_blocks.blocks[0]) == 3
     assert num_computed_tokens == 3 * 16
 
 
@@ -675,7 +915,7 @@ def test_cache_key_salting():
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
 
     # Completed block should have hashes with extra keys.
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     block_hashes = manager.req_to_block_hashes[req0.request_id]
     assert len(block_hashes) == 3
@@ -684,18 +924,18 @@ def test_cache_key_salting():
     assert block_hashes[2].extra_keys is None
 
     blocks = manager.allocate_slots(req0, 59,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
     req0.num_computed_tokens = 59
 
     # Append slots without allocating a new block.
     for _ in range(5):
         req0.append_output_token_ids(8)
     new_blocks = manager.allocate_slots(req0, 5,
-                                        len(computed_blocks.blocks) * 16,
+                                        len(computed_blocks.blocks[0]) * 16,
                                         computed_blocks)
-    assert new_blocks is not None and len(new_blocks.blocks) == 0
+    assert new_blocks is not None and len(new_blocks.blocks[0]) == 0
 
     # Now one more block that should not have extra keys.
     assert len(block_hashes) == 4
@@ -706,14 +946,14 @@ def test_cache_key_salting():
     req1 = make_request("1", token_ids, cache_salt="salt1")
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     # Should match only a prefix of 3 blocks.
-    assert len(computed_blocks.blocks) == 3
+    assert len(computed_blocks.blocks[0]) == 3
     assert num_computed_tokens == 3 * block_size
 
     # Test cache miss with same content but different salt.
     token_ids = common_token_ids + [4] * 11
     req2 = make_request("2", token_ids, cache_salt="salt2")
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
-    assert len(computed_blocks.blocks) == 0
+    assert len(computed_blocks.blocks[0]) == 0
     assert num_computed_tokens == 0
     block_hashes = manager.req_to_block_hashes[req2.request_id]
     assert len(block_hashes) == 3
@@ -738,20 +978,24 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     common_token_ids = [i for i in range(3) for _ in range(16)]
     req0 = make_request("0", common_token_ids)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     manager.allocate_slots(req0, 48,
-                           len(computed_blocks.blocks) * 16, computed_blocks)
-    block_part0 = manager.single_type_manager.req_to_blocks[req0.request_id]
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
+    block_part0 = manager.coordinator.single_type_managers[0].req_to_blocks[
+        req0.request_id]
 
     # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
     req1 = make_request("1", common_token_ids * 2)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
-    assert computed_blocks.blocks == block_part0
+    assert computed_blocks.blocks[0] == block_part0
     assert num_computed_tokens == 3 * 16
     manager.allocate_slots(req1, 48,
-                           len(computed_blocks.blocks) * 16, computed_blocks)
-    block_part1 = manager.single_type_manager.req_to_blocks[req1.request_id]
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
+    block_part1 = manager.coordinator.single_type_managers[0].req_to_blocks[
+        req1.request_id]
     # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
     # | Req1-5(F)| ... |
     manager.free(req1)
@@ -762,10 +1006,11 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     # | Req1-5(F)| Req2-0   | Req2-1   | ... |
     req2 = make_request("2", [7] * block_size * 2)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     manager.allocate_slots(req2, block_size * 2,
-                           len(computed_blocks.blocks) * 16, computed_blocks)
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
 
     # Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed,
     # but it cannot be allocated due to insufficient free blocks (2).
@@ -773,11 +1018,11 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     assert manager.block_pool.free_block_queue.num_free_blocks == 5
     req3 = make_request("3", common_token_ids * 3)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
-    assert computed_blocks.blocks == block_part1
+    assert computed_blocks.blocks[0] == block_part1
     assert num_computed_tokens == 6 * 16
     # Req3 cannot be allocated.
     assert manager.allocate_slots(req3, 48,
-                                  len(computed_blocks.blocks) * 16,
+                                  len(computed_blocks.blocks[0]) * 16,
                                   computed_blocks) is None
     # Block 0-2 are used by Req 1.
     assert {block.ref_cnt for block in block_part1[:3]} == {1}
@@ -797,18 +1042,18 @@ def test_reset_prefix_cache():
     all_token_ids = full_block_token_ids + unique_token_ids
     req0 = make_request("0", all_token_ids)
     blocks = manager.allocate_slots(req0, 55)
-    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
+    assert blocks.get_block_ids() == ([1, 2, 3, 4], )
 
     unique_token_ids = [4] * 7
     all_token_ids = full_block_token_ids + unique_token_ids
     req1 = make_request("1", all_token_ids)
     computed_blocks, _ = manager.get_computed_blocks(req1)
     assert len(manager.req_to_block_hashes[req1.request_id]) == 3
-    assert len(computed_blocks.blocks) == 3
+    assert len(computed_blocks.blocks[0]) == 3
     blocks = manager.allocate_slots(req1, 7,
-                                    len(computed_blocks.blocks) * 16,
+                                    len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
-    assert blocks.get_block_ids() == [[5]]
+    assert blocks.get_block_ids() == ([5], )
 
     # Failed to reset prefix cache because some blocks are not freed yet.
     assert not manager.reset_prefix_cache()
@@ -836,10 +1081,11 @@ def test_prefix_cache_stats_disabled():
     # Call all functions that check whether log_stats is disabled.
     req = make_request("0", list(range(16)))
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
-    assert not computed_blocks.blocks
+    assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     manager.allocate_slots(req, 16,
-                           len(computed_blocks.blocks) * 16, computed_blocks)
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
     manager.reset_prefix_cache()
 
     # Ensure prefix_cache_stats remains None
@@ -918,7 +1164,8 @@ def test_eagle_enabled_removes_last_block():
     # Prime the cache
     computed_blocks, _ = manager.get_computed_blocks(req)
     manager.allocate_slots(req, len(token_ids),
-                           len(computed_blocks.blocks) * 16, computed_blocks)
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
     manager.free(req)
 
     # New request with same tokens + Eagle enabled
@@ -928,7 +1175,7 @@ def test_eagle_enabled_removes_last_block():
     # Should retain 1 block:
     # 1. Original 3 blocks → pop last hash → 2 matched blocks
     # 2. drop last matched block → 1 remaining block
-    assert len(computed_blocks.blocks) == 1
+    assert len(computed_blocks.blocks[0]) == 1
     assert num_tokens == 1 * block_size  # 16 tokens
 
 
@@ -948,14 +1195,15 @@ def test_eagle_with_partial_blocks():
     # Prime the cache
     computed_blocks, _ = manager.get_computed_blocks(req)
     manager.allocate_slots(req, len(token_ids),
-                           len(computed_blocks.blocks) * 16, computed_blocks)
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
     manager.free(req)
 
     # New request with Eagle enabled
     req_eagle = make_request("partial_eagle", token_ids)
     computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle)
     # Original match: 2 full blocks → Eagle removes 1 → 1 remaining
-    assert len(computed_blocks.blocks) == 1
+    assert len(computed_blocks.blocks[0]) == 1
     assert num_tokens == 1 * block_size
 
 
@@ -973,7 +1221,7 @@ def test_eagle_with_sliding_window():
     manager = KVCacheManager(
         KVCacheConfig(
             num_blocks=10,
-            tensors={},
+            kv_cache_tensors=[],
             kv_cache_groups=[KVCacheGroupSpec(['layer'], sliding_window_spec)],
         ),
         max_model_len=8192,
@@ -988,7 +1236,8 @@ def test_eagle_with_sliding_window():
     # Prime the cache
     computed_blocks, _ = manager.get_computed_blocks(req)
     manager.allocate_slots(req, len(token_ids),
-                           len(computed_blocks.blocks) * 16, computed_blocks)
+                           len(computed_blocks.blocks[0]) * 16,
+                           computed_blocks)
     # record the block hash of the first block in the request for later use
     block_hash_first_block = manager.req_to_block_hashes[req.request_id][0]
     assert block_hash_first_block is not None
@@ -998,13 +1247,14 @@ def test_eagle_with_sliding_window():
     req_eagle = make_request("partial_eagle", token_ids)
     computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle)
     # Original match: 2 full blocks → Eagle removes 1 → 1 remaining
-    assert len(computed_blocks.blocks) == 1
+    assert len(computed_blocks.blocks[0]) == 1
     assert num_tokens == 1 * block_size
 
     # Evict the first block in the request
     assert manager.block_pool.get_cached_block(
-        block_hash_first_block) is not None
-    manager.block_pool.cached_block_hash_to_block.pop(block_hash_first_block)
+        block_hash_first_block, kv_cache_group_ids=[0]) is not None
+    manager.block_pool.cached_block_hash_to_block.pop(
+        BlockHashWithGroupId(block_hash_first_block, 0))
 
     # New request
     req_after_evict = make_request("partial_eagle_after_evict", token_ids)
@@ -1012,5 +1262,5 @@ def test_eagle_with_sliding_window():
     # Cache miss. The only hit prefix is [NULL_BLOCK, BLOCK_2] if eagle is
     # not considered. But after dropping the last matched block due to eagle,
     # there will be no matched prefix.
-    assert len(computed_blocks.blocks) == 0
+    assert len(computed_blocks.blocks[0]) == 0
     assert num_tokens == 0
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index f40d477a00363604b1df287f6e4a368f011772b0..d348956aa17734d563e217f1c99d4bd1aa6a1425 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 from unittest.mock import Mock
 
@@ -96,7 +97,7 @@ def create_scheduler(
     )
     kv_cache_config = KVCacheConfig(
         num_blocks=num_blocks,  # A large number of blocks to hold all requests
-        tensors={},
+        kv_cache_tensors=[],
         kv_cache_groups=[
             KVCacheGroupSpec(['layer'],
                              FullAttentionSpec(block_size, 1, 1, torch.float32,
@@ -138,7 +139,6 @@ def create_requests(num_requests: int,
             multi_modal_placeholders=mm_position,
             multi_modal_hashes=None,
             eos_token_id=EOS_TOKEN_ID,
-            arrival_time=0,
         )
         requests.append(request)
     return requests
@@ -744,7 +744,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
         assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i])
 
     # No draft or accepted tokens counted yet
-    assert engine_core_outputs.scheduler_stats.spec_decoding_stats is None
+    assert not engine_core_outputs or (
+        engine_core_outputs[0].scheduler_stats.spec_decoding_stats is None)
 
     # Schedule the speculated tokens for validation
     output = scheduler.schedule()
@@ -772,7 +773,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
     engine_core_outputs = scheduler.update_from_output(output,
                                                        model_runner_output)
 
-    scheduler_stats = engine_core_outputs.scheduler_stats
+    scheduler_stats = engine_core_outputs[0].scheduler_stats \
+        if engine_core_outputs else None
     if expected[0] == 0:
         assert scheduler_stats.spec_decoding_stats is None
     else:
@@ -812,10 +814,10 @@ def _assert_right_kv_cache_manager(
     # Make sure the request stats are right.
     EXPECTED_TOTAL_BLOCKS = num_tokens // block_size
     for req_id in req_ids:
-        blocks = (scheduler.kv_cache_manager.single_type_manager.
-                  req_to_blocks[req_id])
+        blocks = (scheduler.kv_cache_manager.coordinator.
+                  single_type_managers[0].req_to_blocks[req_id])
         hashes = scheduler.kv_cache_manager.req_to_block_hashes[req_id]
-        assert (scheduler.kv_cache_manager.single_type_manager.
+        assert (scheduler.kv_cache_manager.coordinator.single_type_managers[0].
                 num_cached_block[req_id] == EXPECTED_TOTAL_BLOCKS)
         assert len(blocks) == EXPECTED_TOTAL_BLOCKS
         assert len(hashes) == EXPECTED_TOTAL_BLOCKS
@@ -843,7 +845,7 @@ def _step_until_done(
             # We should be in the decode phase now.
             assert num_scheduled_tokens == 1
         assert len(output.kv_connector_metadata.requests) == 0
-        ecos = scheduler.update_from_output(output, model_runner_output)
+        ecos = scheduler.update_from_output(output, model_runner_output)[0]
         all_done = True
         for eco in ecos.outputs:
             if eco.finish_reason is None:
@@ -1196,11 +1198,11 @@ def assert_scheduler_empty(scheduler: Scheduler):
     assert len(scheduler.encoder_cache_manager.cached) == 0
 
     # KVCache Manager.
-    assert len(
-        scheduler.kv_cache_manager.single_type_manager.req_to_blocks) == 0
+    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
+               req_to_blocks) == 0
     assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0
-    assert len(
-        scheduler.kv_cache_manager.single_type_manager.num_cached_block) == 0
+    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
+               num_cached_block) == 0
     num_free_blocks = (
         scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks)
     assert num_free_blocks == (
diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py
index 511d57d405ba252511fbadc1e96676d8abbce211..85415f6ad4b693b4b9ecd9121a3c189a8f37821f 100644
--- a/tests/v1/core/test_scheduler_e2e.py
+++ b/tests/v1/core/test_scheduler_e2e.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import pytest
diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py
index 101a2379be377cdda6e3a5a1d647f40f4bc25787..a9e1898df9344aa3309e957749189adb6b22f07e 100644
--- a/tests/v1/core/test_specialized_manager.py
+++ b/tests/v1/core/test_specialized_manager.py
@@ -1,9 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
 from vllm.v1.core.block_pool import BlockPool
-from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock
+from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
+                                         KVCacheBlock)
 from vllm.v1.core.single_type_kv_cache_manager import SlidingWindowManager
 from vllm.v1.kv_cache_interface import SlidingWindowSpec
 
@@ -11,9 +13,8 @@ from vllm.v1.kv_cache_interface import SlidingWindowSpec
 def get_sliding_window_manager(sliding_window_spec, block_pool):
     return SlidingWindowManager(sliding_window_spec,
                                 block_pool,
-                                use_eagle=False,
-                                num_kv_cache_groups=1,
-                                caching_hash_fn=lambda x: x)
+                                caching_hash_fn=lambda x: x,
+                                kv_cache_group_id=0)
 
 
 def test_sliding_window_possible_cached_prefix():
@@ -32,7 +33,7 @@ def test_sliding_window_possible_cached_prefix():
 
     def run_one_case(block_is_cached, expect_length):
         block_hash_list = [
-            BlockHashType(i, ()) for i in range(len(block_is_cached))
+            BlockHash(i, ()) for i in range(len(block_is_cached))
         ]
 
         block_pool.cached_block_hash_to_block.clear()
@@ -41,13 +42,18 @@ def test_sliding_window_possible_cached_prefix():
         for i, (block_hash,
                 is_cached) in enumerate(zip(block_hash_list, block_is_cached)):
             if is_cached:
-                block_pool.cached_block_hash_to_block[block_hash] = {
-                    i: block_pool.blocks[i + 10]
-                }
+                block_pool.cached_block_hash_to_block[BlockHashWithGroupId(
+                    block_hash, 0)] = {
+                        i: block_pool.blocks[i + 10],
+                    }
 
         computed_blocks = manager.find_longest_cache_hit(
-            block_hash_list,
-            len(block_hash_list) * block_size)
+            block_hashes=block_hash_list,
+            max_length=len(block_hash_list) * block_size,
+            kv_cache_group_ids=[0],
+            block_pool=block_pool,
+            kv_cache_spec=sliding_window_spec,
+            use_eagle=False)[0]
         assert len(computed_blocks) == expect_length
 
         assert all(block == block_pool.null_block
@@ -94,13 +100,13 @@ def test_sliding_window_remove_skipped_blocks():
 
     null_block_id = block_pool.null_block.block_id
 
-    def id_to_block_table(ids):
+    def id_to_block_table(ids) -> list[KVCacheBlock]:
         return [
             KVCacheBlock(id_)
             if id_ != null_block_id else block_pool.null_block for id_ in ids
         ]
 
-    def assert_block_id(block_table, ids):
+    def assert_block_id(block_table: list[KVCacheBlock], ids: list[int]):
         for block, id_ in zip(block_table, ids):
             if id_ == null_block_id:
                 assert block == block_pool.null_block
@@ -143,3 +149,26 @@ def test_sliding_window_remove_skipped_blocks():
     # of removed blocks should be [1003, 1002].
     manager.remove_skipped_blocks("test", 11)
     assert_block_id(block_table, [null_block_id] * 4 + original_block_ids[4:])
+
+
+def test_get_num_blocks_to_allocate():
+    block_size = 2
+    sliding_window_spec = SlidingWindowSpec(
+        block_size=block_size,
+        num_kv_heads=1,
+        head_size=1,
+        dtype=torch.float32,
+        sliding_window=4,  # Placeholder value, not related to test result
+        use_mla=False,
+    )
+
+    block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
+    manager = get_sliding_window_manager(sliding_window_spec, block_pool)
+    cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)]
+    cached_blocks_2 = [block_pool.null_block for _ in range(5)
+                       ] + [KVCacheBlock(i + 1) for i in range(5)]
+
+    assert manager.get_num_blocks_to_allocate("1", 20 * block_size,
+                                              cached_blocks_1) == 20
+    assert manager.get_num_blocks_to_allocate("2", 20 * block_size,
+                                              cached_blocks_2) == 15
diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py
index 48c265560348c80e540d71c1e05f3d9906b398a9..161bcd4d3ef9d5fd9776fd8ad09f742cc9a4fc4d 100644
--- a/tests/v1/e2e/test_cascade_attention.py
+++ b/tests/v1/e2e/test_cascade_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py
index a125d3fb79750ac352e5a5d72c92b98ec1747c83..d8882b1d94324d2061ebf23b74e76202177235bf 100644
--- a/tests/v1/e2e/test_correctness_sliding_window.py
+++ b/tests/v1/e2e/test_correctness_sliding_window.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
 import pytest
@@ -17,7 +18,7 @@ class TestConfig:
 
 model_config = {
     "bigcode/starcoder2-3b": TestConfig(4096, (800, 1100)),
-    "google/gemma-2-2b-it": TestConfig(4096, (400, 800)),
+    "google/gemma-3-1b-it": TestConfig(4096, (400, 800)),
 }
 
 
@@ -25,7 +26,7 @@ model_config = {
     "model",
     [
         "bigcode/starcoder2-3b",  # sliding window only
-        "google/gemma-2-2b-it",  # sliding window + full attention
+        "google/gemma-3-1b-it",  # sliding window + full attention
     ])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 2fad37d6801bb5d731a1afd9ae13a949e9aa88bc..93e7c12f3a091cfcfd217e526108624dac725572 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 import random
diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py
index d04679c12448abb4be0902ba2c02f790c53cd539..d7722142b207f85d28ec1ab7f48ae83ddba23f8f 100644
--- a/tests/v1/engine/conftest.py
+++ b/tests/v1/engine/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 5d52ad5f53280e7e47275a3d3953d2be995c87ea..7dff937c0fd9f7dc565eab262d67857bc6bc1236 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from contextlib import ExitStack
@@ -21,9 +22,11 @@ if not current_platform.is_cuda():
     pytest.skip(reason="V1 currently only supported on CUDA.",
                 allow_module_level=True)
 
-TEXT_ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B-Instruct",
-                                   enforce_eager=True,
-                                   disable_log_requests=True)
+TEXT_ENGINE_ARGS = AsyncEngineArgs(
+    model="meta-llama/Llama-3.2-1B-Instruct",
+    enforce_eager=True,
+    disable_log_requests=True,
+)
 
 VISION_ENGINE_ARGS = AsyncEngineArgs(model="Qwen/Qwen2-VL-2B-Instruct",
                                      enforce_eager=True,
@@ -40,28 +43,33 @@ VISION_PROMPT = {
     "prompt": VISION_PROMPT_TEMPLATE,
     "multi_modal_data": {
         "image": ImageAsset("stop_sign").pil_image
-    }
+    },
 }
 
 
-async def generate(engine: AsyncLLM,
-                   request_id: str,
-                   prompt: PromptType,
-                   output_kind: RequestOutputKind,
-                   max_tokens: int,
-                   n: int = 1,
-                   prompt_logprobs: Optional[int] = None) -> tuple[int, str]:
+async def generate(
+    engine: AsyncLLM,
+    request_id: str,
+    prompt: PromptType,
+    output_kind: RequestOutputKind,
+    max_tokens: int,
+    n: int = 1,
+    prompt_logprobs: Optional[int] = None,
+    cancel_after: Optional[int] = None,
+) -> tuple[int, str]:
     # Ensure generate doesn't complete too fast for cancellation test.
     await asyncio.sleep(0.2)
 
     count = 0
-    sampling_params = SamplingParams(max_tokens=max_tokens,
-                                     ignore_eos=True,
-                                     output_kind=output_kind,
-                                     temperature=0.5,
-                                     seed=33,
-                                     n=n,
-                                     prompt_logprobs=prompt_logprobs)
+    sampling_params = SamplingParams(
+        max_tokens=max_tokens,
+        ignore_eos=True,
+        output_kind=output_kind,
+        temperature=0.5,
+        seed=33,
+        n=n,
+        prompt_logprobs=prompt_logprobs,
+    )
     async for out in engine.generate(request_id=request_id,
                                      prompt=prompt,
                                      sampling_params=sampling_params):
@@ -72,20 +80,27 @@ async def generate(engine: AsyncLLM,
         else:
             count = num_tokens
 
-        await asyncio.sleep(0.)
+        if cancel_after is not None and count >= cancel_after:
+            return count, request_id
+
+        await asyncio.sleep(0.0)
 
     return count, request_id
 
 
 @pytest.mark.parametrize(
     "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
-@pytest.mark.parametrize("engine_args,prompt",
-                         [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
-                          (VISION_ENGINE_ARGS, VISION_PROMPT)])
+@pytest.mark.parametrize(
+    "engine_args,prompt",
+    [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
+)
 @pytest.mark.asyncio
-async def test_load(monkeypatch: pytest.MonkeyPatch,
-                    output_kind: RequestOutputKind,
-                    engine_args: AsyncEngineArgs, prompt: PromptType):
+async def test_load(
+    monkeypatch: pytest.MonkeyPatch,
+    output_kind: RequestOutputKind,
+    engine_args: AsyncEngineArgs,
+    prompt: PromptType,
+):
     # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
     # so that in the future when we switch, we don't have to change all the
     # tests.
@@ -124,13 +139,17 @@ async def test_load(monkeypatch: pytest.MonkeyPatch,
 
 @pytest.mark.parametrize(
     "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
-@pytest.mark.parametrize("engine_args,prompt",
-                         [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
-                          (VISION_ENGINE_ARGS, VISION_PROMPT)])
+@pytest.mark.parametrize(
+    "engine_args,prompt",
+    [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
+)
 @pytest.mark.asyncio
-async def test_abort(monkeypatch: pytest.MonkeyPatch,
-                     output_kind: RequestOutputKind,
-                     engine_args: AsyncEngineArgs, prompt: PromptType):
+async def test_abort(
+    monkeypatch: pytest.MonkeyPatch,
+    output_kind: RequestOutputKind,
+    engine_args: AsyncEngineArgs,
+    prompt: PromptType,
+):
 
     with monkeypatch.context() as m, ExitStack() as after:
         m.setenv("VLLM_USE_V1", "1")
@@ -149,8 +168,9 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch,
         # Create concurrent requests.
         tasks: list[asyncio.Task] = []
         for idx, request_id in enumerate(request_ids):
-            max_tokens = NUM_EXPECTED_TOKENS_LONG if (
-                idx in REQUEST_IDS_TO_ABORT) else NUM_EXPECTED_TOKENS
+            max_tokens = (NUM_EXPECTED_TOKENS_LONG if
+                          (idx
+                           in REQUEST_IDS_TO_ABORT) else NUM_EXPECTED_TOKENS)
             n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1
             tasks.append(
                 asyncio.create_task(
@@ -191,12 +211,17 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch,
 
 
 @pytest.mark.parametrize("n", [1, 3])
-@pytest.mark.parametrize("engine_args,prompt",
-                         [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
-                          (VISION_ENGINE_ARGS, VISION_PROMPT)])
+@pytest.mark.parametrize(
+    "engine_args,prompt",
+    [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
+)
 @pytest.mark.asyncio
-async def test_finished_flag(monkeypatch: pytest.MonkeyPatch, n: int,
-                             engine_args: AsyncEngineArgs, prompt: PromptType):
+async def test_finished_flag(
+    monkeypatch: pytest.MonkeyPatch,
+    n: int,
+    engine_args: AsyncEngineArgs,
+    prompt: PromptType,
+):
 
     with monkeypatch.context() as m, ExitStack() as after:
         m.setenv("VLLM_USE_V1", "1")
@@ -204,11 +229,13 @@ async def test_finished_flag(monkeypatch: pytest.MonkeyPatch, n: int,
         engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
 
-        sampling_params = SamplingParams(max_tokens=100,
-                                         output_kind=RequestOutputKind.DELTA,
-                                         temperature=1.0,
-                                         seed=33,
-                                         n=n)
+        sampling_params = SamplingParams(
+            max_tokens=100,
+            output_kind=RequestOutputKind.DELTA,
+            temperature=1.0,
+            seed=33,
+            n=n,
+        )
         outputs = [
             out
             async for out in engine.generate(request_id="request-33",
@@ -221,6 +248,63 @@ async def test_finished_flag(monkeypatch: pytest.MonkeyPatch, n: int,
         assert outputs[-1].finished
 
 
+@pytest.mark.parametrize(
+    "engine_args,prompt",
+    [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
+)
+@pytest.mark.asyncio
+async def test_mid_stream_cancellation(monkeypatch: pytest.MonkeyPatch,
+                                       engine_args: AsyncEngineArgs,
+                                       prompt: PromptType):
+    """Test that requests can be cancelled mid-stream."""
+    with monkeypatch.context() as m, ExitStack() as after:
+        m.setenv("VLLM_USE_V1", "1")
+
+        engine = AsyncLLM.from_engine_args(engine_args)
+        after.callback(engine.shutdown)
+
+        NUM_REQUESTS = 100
+        NUM_TOKENS = 1000
+        NUM_EXPECTED_TOKENS = 20
+
+        request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
+
+        # Create concurrent requests that will be cancelled mid-stream
+        tasks = []
+        for request_id in request_ids:
+            tasks.append(
+                asyncio.create_task(
+                    generate(
+                        engine,
+                        request_id,
+                        prompt,
+                        RequestOutputKind.DELTA,
+                        NUM_TOKENS,
+                        cancel_after=NUM_EXPECTED_TOKENS,
+                    )))
+
+        # Wait for all tasks to complete
+        results = await asyncio.gather(*tasks)
+
+        # Verify all tasks were cancelled at the expected point
+        for num_generated_tokens, request_id in results:
+            assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
+                f"{request_id} generated {num_generated_tokens} tokens but "
+                f"expected to cancel after {NUM_EXPECTED_TOKENS}")
+
+        # Make sure no requests are left hanging
+        assert not engine.output_processor.has_unfinished_requests()
+
+        # Confirm we can reuse the request id after the cancellations.
+        request_id = request_ids[0]
+        task = asyncio.create_task(
+            generate(engine, request_id, prompt, RequestOutputKind.DELTA,
+                     NUM_EXPECTED_TOKENS))
+        num_generated_tokens, request_id = await task
+        assert num_generated_tokens == NUM_EXPECTED_TOKENS
+        assert not engine.output_processor.has_unfinished_requests()
+
+
 class MockLoggingStatLogger(LoggingStatLogger):
 
     def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
@@ -249,3 +333,32 @@ async def test_customize_loggers(monkeypatch):
         assert len(engine.stat_loggers) == 1
         assert len(engine.stat_loggers[0]) == 1
         engine.stat_loggers[0][0].log.assert_called_once()
+
+
+@pytest.mark.asyncio(scope="module")
+async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m, ExitStack() as after:
+        m.setenv("VLLM_USE_V1", "1")
+
+        engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
+        after.callback(engine.shutdown)
+
+        sampling_params = SamplingParams(max_tokens=100,
+                                         output_kind=RequestOutputKind.DELTA,
+                                         temperature=1.0,
+                                         seed=33)
+
+        # Test with valid DP rank.
+        async for _ in engine.generate(request_id="request-34",
+                                       prompt=TEXT_PROMPT,
+                                       sampling_params=sampling_params,
+                                       data_parallel_rank=0):
+            pass
+
+        # Test with out-of-range DP rank.
+        with pytest.raises(ValueError):
+            async for _ in engine.generate(request_id="request-35",
+                                           prompt=TEXT_PROMPT,
+                                           sampling_params=sampling_params,
+                                           data_parallel_rank=1):
+                pass
diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py
index 9b2f1a919931941176cb61991cfd98c3b2d2cf73..f70a3ce147ff2638e26ba8b10a83756cad0bd3f9 100644
--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from argparse import ArgumentError
 
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index dcf494825b0d45a10c667aed0070b67f8726c720..1cbbf30371afd561dfdbb100065b4bd69622b92c 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 import time
@@ -41,6 +42,7 @@ def make_request() -> EngineCoreRequest:
         arrival_time=time.time(),
         lora_request=None,
         cache_salt=None,
+        data_parallel_rank=None,
     )
 
 
@@ -88,7 +90,7 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
         assert len(engine_core.scheduler.running) == 4
 
         # Loop through until they are all done.
-        while len(engine_core.step().outputs) > 0:
+        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
             pass
 
         assert len(engine_core.scheduler.waiting) == 0
@@ -163,11 +165,11 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
         req0.request_id = req1.request_id = "test"
         engine_core.add_request(req0)
 
-        while len(engine_core.step().outputs) > 0:
+        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
             pass
 
         engine_core.add_request(req1)
-        while len(engine_core.step().outputs) > 0:
+        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
             pass
 
         assert len(engine_core.scheduler.waiting) == 0
@@ -207,7 +209,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
             assert len(engine_core.scheduler.waiting) == 1
             assert len(engine_core.scheduler.running) == 0
             # Loop through until they are all done.
-            while len(engine_core.step().outputs) > 0:
+            while (outs := engine_core.step()[0].get(0)) and outs.outputs:
                 pass
             assert len(engine_core.scheduler.waiting) == 0
             assert len(engine_core.scheduler.running) == 0
@@ -296,7 +298,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
         engine_core.add_request(req1)
 
         # Schedule Batch 1: (10, req0)
-        assert engine_core.step_with_batch_queue() is None
+        assert engine_core.step_with_batch_queue()[0] is None
         assert engine_core.batch_queue.qsize() == 1
         scheduler_output = engine_core.batch_queue.queue[-1][1]
         assert scheduler_output.num_scheduled_tokens[0] == 10
@@ -305,7 +307,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
             req0.request_id].num_computed_tokens == 10
 
         # Schedule Batch 2: (2, req0), (8, req1)
-        assert engine_core.step_with_batch_queue() is None
+        assert engine_core.step_with_batch_queue()[0] is None
         assert engine_core.batch_queue.qsize() == 2
         scheduler_output = engine_core.batch_queue.queue[-1][1]
         assert scheduler_output.num_scheduled_tokens[0] == 2
@@ -327,7 +329,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
         assert scheduler_output.num_scheduled_tokens[1] == 4
 
         # Batch queue is full. Finish Batch 2. Get first token of req0.
-        output = engine_core.step_with_batch_queue()
+        output = engine_core.step_with_batch_queue()[0].get(0)
         assert output is not None
         assert len(output.outputs) == 1
         assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
@@ -339,7 +341,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
         assert scheduler_output.num_scheduled_tokens[0] == 1
 
         # Batch queue is full. Finish Batch 3. Get first token of req1.
-        output = engine_core.step_with_batch_queue()
+        output = engine_core.step_with_batch_queue()[0].get(0)
         assert output is not None
         assert len(output.outputs) == 1
         assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
@@ -358,11 +360,11 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
             engine_core.scheduler.requests[1].num_tokens + 1,
         ]
         while engine_core.scheduler.get_num_unfinished_requests() == 2:
-            output = engine_core.step_with_batch_queue()
+            output = engine_core.step_with_batch_queue()[0]
             if step % 2 == 0:
                 # Even steps consumes an output.
                 assert output is not None
-                assert len(output.outputs) == 1
+                assert len(output[0].outputs) == 1
                 if req_id in engine_core.scheduler.requests:
                     assert engine_core.scheduler.requests[
                         req_id].num_tokens == expected_num_tokens[req_id]
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 8bea032f656fc9406500313cc1a672901221d1b9..c2dc3b4731b5ae10c4600820d11c659da98f0110 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import os
@@ -11,8 +12,10 @@ from typing import Optional
 import pytest
 from transformers import AutoTokenizer
 
+from tests.utils import multi_gpu_test
 from vllm import SamplingParams
-from vllm.distributed.kv_events import BlockStored, KVEventBatch
+from vllm.distributed.kv_events import (BlockStored, KVEventBatch,
+                                        ZmqEventPublisher)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
 from vllm.usage.usage_lib import UsageContext
@@ -36,10 +39,15 @@ PROMPT = "Hello my name is Robert and I love quantization kernels"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
 
 
-def make_request(params: SamplingParams) -> EngineCoreRequest:
+def make_request(
+        params: SamplingParams,
+        prompt_tokens_ids: Optional[list[int]] = None) -> EngineCoreRequest:
+    if not prompt_tokens_ids:
+        prompt_tokens_ids = PROMPT_TOKENS
+
     return EngineCoreRequest(
         request_id=str(uuid.uuid4()),
-        prompt_token_ids=PROMPT_TOKENS,
+        prompt_token_ids=prompt_tokens_ids,
         mm_inputs=None,
         mm_hashes=None,
         mm_placeholders=None,
@@ -48,6 +56,7 @@ def make_request(params: SamplingParams) -> EngineCoreRequest:
         arrival_time=time.time(),
         lora_request=None,
         cache_salt=None,
+        data_parallel_rank=None,
     )
 
 
@@ -87,6 +96,25 @@ async def loop_until_done_async(client: EngineCoreClient, outputs: dict):
             break
 
 
+async def loop_until_fully_done_async(client: EngineCoreClient, outputs: dict):
+
+    while True:
+        engine_core_outputs = (await client.get_output_async()).outputs
+
+        if len(engine_core_outputs) == 0:
+            continue
+
+        # Add outputs to the dict
+        for out in engine_core_outputs:
+            outputs[out.request_id].append(out)
+
+        # Check if all request IDs in outputs have finished
+        if all(outs and outs[-1].finished for outs in outputs.values()):
+            break
+
+        await asyncio.sleep(0.1)
+
+
 # Dummy utility function to monkey-patch into engine core.
 def echo(self, msg: str, err_msg: Optional[str] = None) -> str:
     print(f"echo util function called: {msg}, {err_msg}")
@@ -272,10 +300,12 @@ def test_kv_cache_events(
         block_size = 16
         num_blocks = 2
 
-        engine_args = EngineArgs(model=MODEL_NAME,
-                                 enforce_eager=True,
-                                 enable_prefix_caching=True,
-                                 block_size=block_size)
+        engine_args = EngineArgs(
+            model=MODEL_NAME,
+            enforce_eager=True,
+            enable_prefix_caching=True,
+            block_size=block_size,
+        )
         engine_args.kv_events_config = publisher_config
 
         vllm_config = engine_args.create_engine_config(
@@ -296,19 +326,8 @@ def test_kv_cache_events(
 
         try:
             custom_tokens = list(range(num_blocks * block_size))
-            request = EngineCoreRequest(
-                request_id=str(uuid.uuid4()),
-                prompt_token_ids=custom_tokens,
-                mm_inputs=None,
-                mm_hashes=None,
-                mm_placeholders=None,
-                sampling_params=SamplingParams(
-                    max_tokens=1),  # Short completion for speed
-                eos_token_id=None,
-                arrival_time=time.time(),
-                lora_request=None,
-                cache_salt=None,
-            )
+            sampling_params = SamplingParams(max_tokens=1)
+            request = make_request(sampling_params, custom_tokens)
             client.add_request(request)
 
             outputs: dict[str, list] = {request.request_id: []}
@@ -320,24 +339,130 @@ def test_kv_cache_events(
             seq, received = result
 
             assert seq == 0, "Sequence number mismatch"
-            assert len(received.events) == 1, (
-                "We should have exactly one BlockStored event")
+            assert (len(received.events) == 1
+                    ), "We should have exactly one BlockStored event"
             event = received.events[0]
             assert isinstance(
-                event, BlockStored), ("We should have a BlockStored event")
-            assert len(event.block_hashes) == num_blocks, (
-                "We should have a BlockStored event with 2 block_hashes")
-            assert event.block_size == block_size, (
-                "Block size should be the same as the block size")
-            assert event.parent_block_hash is None, (
-                "Parent block hash should be None")
+                event, BlockStored), "We should have a BlockStored event"
+            assert (len(event.block_hashes) == num_blocks
+                    ), "We should have a BlockStored event with 2 block_hashes"
+            assert (event.block_size == block_size
+                    ), "Block size should be the same as the block size"
+            assert (event.parent_block_hash
+                    is None), "Parent block hash should be None"
             assert event.lora_id is None, "Lora id should be None"
-            assert len(event.token_ids) == num_blocks * block_size, (
-                "Token ids should be the same as the custom tokens")
-            assert event.token_ids == custom_tokens, (
-                "Token ids should be the same as the custom tokens")
+            assert (len(event.token_ids) == num_blocks * block_size
+                    ), "Token ids should be the same as the custom tokens"
+            assert (event.token_ids == custom_tokens
+                    ), "Token ids should be the same as the custom tokens"
+        finally:
+            client.shutdown()
+            subscriber.close()
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "multiprocessing_mode,publisher_config",
+    [(True, "tcp")],
+    indirect=["publisher_config"],
+)
+@multi_gpu_test(num_gpus=4)
+async def test_kv_cache_events_dp(
+    monkeypatch: pytest.MonkeyPatch,
+    multiprocessing_mode: bool,
+    publisher_config,
+):
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        block_size = 16
+        num_blocks = 2
+        dp_size = 2
+        tp_size = 2
+
+        engine_args = EngineArgs(
+            model=MODEL_NAME,
+            enforce_eager=True,
+            enable_prefix_caching=True,
+            data_parallel_size=dp_size,
+            tensor_parallel_size=tp_size,
+            block_size=block_size,
+        )
+        engine_args.kv_events_config = publisher_config
+
+        vllm_config = engine_args.create_engine_config(
+            UsageContext.UNKNOWN_CONTEXT)
+
+        executor_class = Executor.get_class(vllm_config)
+        client = EngineCoreClient.make_client(
+            multiprocess_mode=multiprocessing_mode,
+            asyncio_mode=True,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=False,
+        )
+        await asyncio.sleep(1)
+
+        # Build endpoints for all DP ranks
+        base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
+        endpoints = []
+        for i in range(dp_size):
+            offset_endpoint = ZmqEventPublisher.offset_endpoint_port(
+                base_endpoint, i)
+            endpoints.append(offset_endpoint)
+
+        subscriber = MockSubscriber(endpoints,
+                                    topic=publisher_config.topic,
+                                    decode_type=KVEventBatch)
+
+        try:
+            custom_tokens = list(range(num_blocks * block_size))
+            sampling_params = SamplingParams(max_tokens=1)
+            all_request_ids = []
+
+            # Create and add 25 requests
+            # NOTE: attempts to force routing to both dp groups but can be flaky
+            for i in range(25):
+                await asyncio.sleep(0.01)
+                request = make_request(sampling_params, custom_tokens)
+                await client.add_request_async(request)
+                all_request_ids.append(request.request_id)
+
+            await asyncio.sleep(0.1)
+
+            # Initialize outputs dict for all requests
+            outputs: dict[str, list] = {
+                req_id: []
+                for req_id in all_request_ids
+            }
+
+            print("processing requests...")
+            await asyncio.wait_for(loop_until_fully_done_async(
+                client, outputs),
+                                   timeout=20.0)
+
+            # Receive from subscriber until no more messages
+            print("collecting results...")
+            results = []
+            while True:
+                result = subscriber.receive_one(timeout=1)
+                print(result)
+                if result is None:
+                    break
+                results.append(result)
+
+            # Collect all events and data_parallel_ranks from all results
+            all_dp_ranks = [
+                received.data_parallel_rank for (_, received) in results
+            ]
+            unique_dps = set(all_dp_ranks)
+            assert (
+                len(unique_dps) == 2
+            ), f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
+
         finally:
             client.shutdown()
+            subscriber.close()
 
 
 @pytest.mark.timeout(20)
diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py
index e77916f9582339b17b0f93f81d964d269a92abbe..6284dcfb915bc97740bf01c2a3d60fca9f399a4b 100644
--- a/tests/v1/engine/test_llm_engine.py
+++ b/tests/v1/engine/test_llm_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from typing import Optional
diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index fac701c4ca35b675043e51614f711db01936e9f2..6b88b0cf17e32ccc92d0c6b2fad8002addf4a845 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 import time
@@ -58,6 +59,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
                           eos_token_id=None,
                           lora_request=None,
                           cache_salt=None,
+                          data_parallel_rank=None,
                           sampling_params=SamplingParams(
                               skip_special_tokens=False,
                               spaces_between_special_tokens=False,
@@ -405,6 +407,7 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
                           eos_token_id=None,
                           lora_request=None,
                           cache_salt=None,
+                          data_parallel_rank=None,
                           sampling_params=SamplingParams(
                               skip_special_tokens=False,
                               spaces_between_special_tokens=False,
@@ -568,6 +571,7 @@ def test_stop_token(include_stop_str_in_output: bool,
         eos_token_id=eos_token_id,
         lora_request=None,
         cache_salt=None,
+        data_parallel_rank=None,
         sampling_params=SamplingParams(
             skip_special_tokens=False,
             spaces_between_special_tokens=False,
@@ -665,6 +669,7 @@ def test_stop_string(include_stop_str_in_output: bool,
             eos_token_id=None,
             lora_request=None,
             cache_salt=None,
+            data_parallel_rank=None,
             sampling_params=SamplingParams(
                 skip_special_tokens=False,
                 spaces_between_special_tokens=False,
@@ -779,6 +784,7 @@ def test_iteration_stats(dummy_test_vectors):
             eos_token_id=None,
             lora_request=None,
             cache_salt=None,
+            data_parallel_rank=None,
             sampling_params=SamplingParams(),
         ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py
index 4a23e0c1b212e91e8e8604a8c5c625b0d028af34..b58bc75fc95651f0fde0f1db6dd700ad90d00d90 100644
--- a/tests/v1/engine/utils.py
+++ b/tests/v1/engine/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
 from dataclasses import dataclass
diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py
index 8c03f04330dd5ef8567e806c16b30d020ac355c8..ffe0612124660cd038b9d6d1db7a715c2395aebe 100644
--- a/tests/v1/entrypoints/conftest.py
+++ b/tests/v1/entrypoints/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index 5f1fff200de31592776ba7a4a8443ca502af6e4a..a39ab47b8d8704c2ee86ac6e7fe0c33a5aa15b70 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -1,5 +1,6 @@
 # ruff: noqa: E501
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/tests/v1/entrypoints/openai/test_chat_completion.py b/tests/v1/entrypoints/openai/test_chat_completion.py
index c650ccd0ccd7d7c135ba5bcf9bff1d462a990cbc..dffb32846c05e38cbadc2085e0ee3a611cac0f2f 100644
--- a/tests/v1/entrypoints/openai/test_chat_completion.py
+++ b/tests/v1/entrypoints/openai/test_chat_completion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai  # use the official client for correctness check
 import pytest
diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py
index 333ad23795f34ba0bf5212dc1926d169330aaa7d..a7c31c0642244e0121b92c9b7ba30265f5d4fd51 100644
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/v1/entrypoints/openai/test_multi_api_servers.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed4ecbe8484c107db21083c795507ca796d0ecc2
--- /dev/null
+++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import os
+
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+
+from tests.utils import RemoteOpenAIServer
+
+MODEL_NAME = "ibm-research/PowerMoE-3b"
+
+DP_SIZE = os.getenv("DP_SIZE", "1")
+
+
+@pytest.fixture(scope="module")
+def default_server_args():
+    return [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "128",
+        "--enforce-eager",
+        "--api-server-count",
+        "4",
+        "--data_parallel_size",
+        DP_SIZE,
+    ]
+
+
+@pytest.fixture(scope="module")
+def server(default_server_args):
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_single_completion(client: openai.AsyncOpenAI,
+                                 model_name: str) -> None:
+
+    async def make_request():
+        completion = await client.completions.create(
+            model=model_name,
+            prompt="Hello, my name is",
+            max_tokens=10,
+            temperature=1.0)
+
+        assert completion.id is not None
+        assert completion.choices is not None and len(completion.choices) == 1
+
+        choice = completion.choices[0]
+        # The exact number of tokens can vary slightly with temperature=1.0,
+        # so we check for a reasonable minimum length.
+        assert len(choice.text) >= 1
+        # Finish reason might not always be 'length' if the model finishes early
+        # or due to other reasons, especially with high temperature.
+        # So, we'll accept 'length' or 'stop'.
+        assert choice.finish_reason in ("length", "stop")
+
+        # Token counts can also vary, so we check they are positive.
+        assert completion.usage.completion_tokens > 0
+        assert completion.usage.prompt_tokens > 0
+        assert completion.usage.total_tokens > 0
+        return completion
+
+    # Test single request
+    result = await make_request()
+    assert result is not None
+
+    await asyncio.sleep(0.5)
+
+    # Send two bursts of requests
+    num_requests = 100
+    tasks = [make_request() for _ in range(num_requests)]
+    results = await asyncio.gather(*tasks)
+    assert len(results) == num_requests
+    assert all(completion is not None for completion in results)
+
+    await asyncio.sleep(0.5)
+
+    tasks = [make_request() for _ in range(num_requests)]
+    results = await asyncio.gather(*tasks)
+    assert len(results) == num_requests
+    assert all(completion is not None for completion in results)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_completion_streaming(client: openai.AsyncOpenAI,
+                                    model_name: str) -> None:
+    prompt = "What is an LLM?"
+
+    async def make_streaming_request():
+        # Perform a non-streaming request to get the expected full output
+        single_completion = await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+        )
+        single_output = single_completion.choices[0].text
+
+        # Perform the streaming request
+        stream = await client.completions.create(model=model_name,
+                                                 prompt=prompt,
+                                                 max_tokens=5,
+                                                 temperature=0.0,
+                                                 stream=True)
+        chunks: list[str] = []
+        finish_reason_count = 0
+        last_chunk = None
+        async for chunk in stream:
+            chunks.append(chunk.choices[0].text)
+            if chunk.choices[0].finish_reason is not None:
+                finish_reason_count += 1
+            last_chunk = chunk  # Keep track of the last chunk
+
+        # finish reason should only return in the last block for OpenAI API
+        assert finish_reason_count == 1, (
+            "Finish reason should appear exactly once.")
+        assert last_chunk is not None, (
+            "Stream should have yielded at least one chunk.")
+        assert last_chunk.choices[
+            0].finish_reason == "length", "Finish reason should be 'length'."
+        # Check that the combined text matches the non-streamed version.
+        assert "".join(
+            chunks
+        ) == single_output, "Streamed output should match non-streamed output."
+        return True  # Indicate success for this request
+
+    # Test single request
+    result = await make_streaming_request()
+    assert result is not None
+
+    await asyncio.sleep(0.5)
+
+    # Send two bursts of requests
+    num_requests = 100
+    tasks = [make_streaming_request() for _ in range(num_requests)]
+    results = await asyncio.gather(*tasks)
+
+    assert len(
+        results
+    ) == num_requests, f"Expected {num_requests} results, got {len(results)}"
+    assert all(results), "Not all streaming requests completed successfully."
+
+    await asyncio.sleep(0.5)
+
+    tasks = [make_streaming_request() for _ in range(num_requests)]
+    results = await asyncio.gather(*tasks)
+
+    assert len(
+        results
+    ) == num_requests, f"Expected {num_requests} results, got {len(results)}"
+    assert all(results), "Not all streaming requests completed successfully."
diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
index c17784e0a263e26cc8b49f11402324c19efca52d..b48655d80eefdbc927faef2a81ddb47756e20c2c 100755
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -8,7 +8,9 @@ MODELS=(
 
 # Number of prefill and decode instances to create
 NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1} # Default to 1
-NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-2}   # Default to 2
+NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1}   # Default to 1
+PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
+DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
 
 # Find the git repository root directory
 GIT_ROOT=$(git rev-parse --show-toplevel)
@@ -74,9 +76,10 @@ run_tests_for_model() {
   for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
     # Calculate GPU ID - we'll distribute across available GPUs
     GPU_ID=$((i % $(get_num_gpus)))
+
     # Calculate port number (base port + instance number)
     PORT=$((8100 + i))
-    # Calculate side channel port
+    # Calculate side channel port. Avoid clash with with TP workers. 
     SIDE_CHANNEL_PORT=$((5559 + i))
 
     echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"
@@ -87,6 +90,7 @@ run_tests_for_model() {
     --enforce-eager \
     --disable-log-requests \
     --gpu-memory-utilization 0.2 \
+    --tensor-parallel-size $PREFILLER_TP_SIZE \
     --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
 
     if [ -n "$model_args" ]; then
@@ -109,7 +113,7 @@ run_tests_for_model() {
     # Calculate port number (base port + instance number)
     PORT=$((8200 + i))
     # Calculate side channel port
-    SIDE_CHANNEL_PORT=$((5659 + i))
+    SIDE_CHANNEL_PORT=$((5659 + i * $DECODER_TP_SIZE))
 
     echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"
 
@@ -119,6 +123,7 @@ run_tests_for_model() {
     --enforce-eager \
     --disable-log-requests \
     --gpu-memory-utilization 0.2 \
+    --tensor-parallel-size $DECODER_TP_SIZE \
     --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
 
     if [ -n "$model_args" ]; then
diff --git a/tests/v1/kv_connector/nixl_integration/test_accuracy.py b/tests/v1/kv_connector/nixl_integration/test_accuracy.py
index be2d84f3bb1718210afe927707395e7b01117805..e5d66ffeeeb2395b4119dda05b0454bab2254210 100644
--- a/tests/v1/kv_connector/nixl_integration/test_accuracy.py
+++ b/tests/v1/kv_connector/nixl_integration/test_accuracy.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import lm_eval
@@ -13,6 +14,7 @@ RTOL = 0.03
 # Model-specific expected values
 EXPECTED_VALUES = {
     "Qwen/Qwen3-0.6B": 0.41,
+    "deepseek-ai/deepseek-vl2-small": 0.59
 }
 
 SIMPLE_PROMPT = "The best part about working on vLLM is that I got to meet so many people across various different organizations like UCB, Google, and Meta which means",  # noqa: E501
diff --git a/tests/v1/kv_connector/nixl_integration/test_edge_cases.py b/tests/v1/kv_connector/nixl_integration/test_edge_cases.py
index 5363fbde0096251142c045a44f89f5c23284d435..95465a25fc9d21f6ea3a4ec805b0468f8ecc1bc3 100644
--- a/tests/v1/kv_connector/nixl_integration/test_edge_cases.py
+++ b/tests/v1/kv_connector/nixl_integration/test_edge_cases.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import openai
diff --git a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
index 13071f581375cd008396bf96a978f9357010fe15..3d720fe0cafeecbcf94f44cd54e831a6e73b6355 100644
--- a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
+++ b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import itertools
diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py
index a21d92c52244d3f9d7b4b31651f05c0860922d42..72848c1a706e78f870135d5eaad3ef61cac45a11 100644
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import filecmp
 import shutil
 import tempfile
@@ -11,6 +12,7 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
     KVConnectorFactory)
 from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (  # noqa
     SharedStorageConnector)
+from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 
 MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 
@@ -31,7 +33,7 @@ class TestSharedStorageConnector(SharedStorageConnector):
         self.call_record: dict[str, int] = defaultdict(int)
         # Use a unique temp file per connector
         self._event_file = tempfile.gettempdir(
-        ) + f"/connector_{self.name}_events.log"
+        ) + f"/connector_{self.name}-{self.role.name}_events.log"
         # Start with an empty file
         with open(self._event_file, "w") as _:
             pass
@@ -51,10 +53,20 @@ class TestSharedStorageConnector(SharedStorageConnector):
 
             def wrapper(*args, **kwargs):
                 self.call_record[name] += 1
+
+                # Include args that we're interested in
+                to_log = [name]
+                for arg in args:
+                    if isinstance(arg, int):
+                        to_log.append(str(arg))
+                    elif isinstance(arg, KVCacheBlocks):
+                        to_log.append(
+                            f"num_blocks={[len(b) for b in arg.blocks]}")
+
                 # Log the event as a line to the file
                 try:
                     with open(self._event_file, "a") as f:
-                        f.write(name + "\n")
+                        f.write(' '.join(to_log) + "\n")
                 except Exception as e:
                     print(f"[ERROR] Could not log event {name} "
                           f"for {self.name}: {e}")
@@ -161,15 +173,23 @@ def test_multi_shared_storage_connector_consistency():
              f"{storage_1_path} and {storage_2_path}")
 
     events = get_connector_events()
-    # get_num_new_matched_tokens will be called on each connector in turn.
-    # neither of them have hits so update_state_after_alloc won't be called.
-    assert events["storage1"][:3] == [
-        'get_num_new_matched_tokens', 'build_connector_meta',
-        'bind_connector_metadata'
+    # get_num_new_matched_tokens and update_state_after_alloc will be called
+    # on each connector in turn.
+    assert events["storage1-SCHEDULER"][:3] == [
+        'get_num_new_matched_tokens 0',
+        'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
+    ]
+    assert events["storage1-WORKER"][:5] == [
+        'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
+        'wait_for_layer_load', 'save_kv_layer'
+    ]
+    assert events["storage2-SCHEDULER"][:3] == [
+        'get_num_new_matched_tokens 0',
+        'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
     ]
-    assert events["storage2"][:3] == [
-        'get_num_new_matched_tokens', 'build_connector_meta',
-        'bind_connector_metadata'
+    assert events["storage2-WORKER"][:5] == [
+        'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
+        'wait_for_layer_load', 'save_kv_layer'
     ]
 
     # Reset prefix cache or else we'll just get the tokens back from there.
@@ -181,16 +201,16 @@ def test_multi_shared_storage_connector_consistency():
 
     events = get_connector_events()
     # get_num_new_matched_tokens will return new tokens from the first
-    # connector so update_state_after_alloc will be called once blocks
-    # are allocated for the first connector.
-    # get_num_new_matched_tokens *won't* be called on the second connector
-    # in this case.
-    assert events["storage1"][:4] == [
-        'get_num_new_matched_tokens', 'update_state_after_alloc',
-        'build_connector_meta', 'bind_connector_metadata'
+    # connector so update_state_after_alloc will be with allocated blocks
+    # on that one but with zero blocks for others (first nonzero match is
+    # chosen).
+    assert events["storage1-SCHEDULER"][:3] == [
+        'get_num_new_matched_tokens 0',
+        'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
     ]
-    assert events["storage2"][:2] == [
-        'build_connector_meta', 'bind_connector_metadata'
+    assert events["storage2-SCHEDULER"][:3] == [
+        'get_num_new_matched_tokens 0',
+        'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
     ]
 
     # Delete storage1 connector state
@@ -204,17 +224,17 @@ def test_multi_shared_storage_connector_consistency():
     _ = llm.generate(PROMPTS, SAMPLING_PARAMS)
 
     events = get_connector_events()
-    # get_num_new_matched_tokens will be called for the first connector but it
-    # won't have a hit so update_state_after_alloc won't be called.
-    # get_num_new_matched_tokens will also be called on the second connector,
-    # but it should have a hit so update_state_after_alloc will be called.
-    assert events["storage1"][:3] == [
-        'get_num_new_matched_tokens', 'build_connector_meta',
-        'bind_connector_metadata'
+    # get_num_new_matched_tokens will be called for both connectors but will
+    # return 0 from the first connector, but the second connector should have
+    # a hit, so update_state_after_alloc will only be called with allocated
+    # blocks for the second connector.
+    assert events["storage1-SCHEDULER"][:3] == [
+        'get_num_new_matched_tokens 0',
+        'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
     ]
-    assert events["storage2"][:4] == [
-        'get_num_new_matched_tokens', 'update_state_after_alloc',
-        'build_connector_meta', 'bind_connector_metadata'
+    assert events["storage2-SCHEDULER"][:3] == [
+        'get_num_new_matched_tokens 0',
+        'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
     ]
 
     # Clean up
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index 9b2a720c11c46e6756cff2d967ff3baaed5c3e17..622ab6f35db339c458a1674265fbd0fcb8b4de85 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
     NixlConnectorMetadata)
@@ -35,8 +36,8 @@ def test_basic_inferface():
     req_meta = kv_connector_metadata.requests[request_id]
 
     for block_id, block in zip(
-            req_meta.local_block_ids, scheduler.kv_cache_manager.
-            single_type_manager.req_to_blocks[request_id]):
+            req_meta.local_block_ids, scheduler.kv_cache_manager.coordinator.
+            single_type_managers[0].req_to_blocks[request_id]):
         assert block_id == block.block_id
 
 
diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
index 77098140343a0a75953cf0f6e67bbf263b21bcd8..ff36a281c413d0dfe416405da91717913f2434e3 100644
--- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 
 from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT
@@ -43,7 +44,7 @@ def test_basic_lifecycle():
     # Ensure the request is finished after 1 tokens.
     assert request.is_finished()
     assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED
-    output = engine_core_outputs.outputs[0]
+    output = engine_core_outputs[0].outputs[0]
     assert output.finish_reason == FinishReason.LENGTH
     assert output.kv_transfer_params is not None
 
@@ -53,8 +54,8 @@ def test_basic_lifecycle():
     assert len(scheduler.waiting) == 0
 
     # ... but blocks should not be freed.
-    blocks = scheduler.kv_cache_manager.single_type_manager.req_to_blocks[
-        request_id]
+    blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
+        0].req_to_blocks[request_id]
     for block in blocks:
         assert block.ref_cnt == 1
 
@@ -165,7 +166,7 @@ def test_prefix_cache_lifecycle():
     scheduler_output = scheduler.schedule()
     model_runner_output = create_model_runner_output(reqs=[request_remote])
     eco = scheduler.update_from_output(scheduler_output, model_runner_output)
-    kv_transfer_params = eco.outputs[0].kv_transfer_params
+    kv_transfer_params = eco[0].outputs[0].kv_transfer_params
 
     # Ensure we send all block ids, even if there is a cache hit.
     assert (len(
diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
index 6fcff0d6204520e2b6b2f60014f5764037644af8..a1156306dc4bfb8270f0903ba6b35fed9c9a409c 100644
--- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 
 from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT
@@ -50,8 +51,8 @@ def test_basic_lifecycle():
     assert (block_pool.free_block_queue.num_free_blocks
             < START_FREE_BLOCK_QUEUE_SIZE)
     assert len(block_pool.cached_block_hash_to_block) == 0
-    blocks = scheduler.kv_cache_manager.single_type_manager.req_to_blocks[
-        request_id]
+    blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
+        0].req_to_blocks[request_id]
     for block in blocks:
         assert block._block_hash is None
 
@@ -61,7 +62,7 @@ def test_basic_lifecycle():
     # (1c): update_from_output()
     engine_core_outputs = scheduler.update_from_output(scheduler_output,
                                                        model_runner_output)
-    assert len(engine_core_outputs.outputs) == 0
+    assert not engine_core_outputs or not engine_core_outputs[0].outputs
 
     # STEP (2):
     # (2a): schedule(): nothing happens!
@@ -86,8 +87,8 @@ def test_basic_lifecycle():
 
     # Confirm the block are actually allocated.
     num_hashed_blocks = 0
-    blocks = scheduler.kv_cache_manager.single_type_manager.req_to_blocks[
-        request_id]
+    blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
+        0].req_to_blocks[request_id]
     for block in blocks:
         assert block.ref_cnt == 1
         num_hashed_blocks += (1 if block._block_hash is not None else 0)
@@ -112,7 +113,7 @@ def test_basic_lifecycle():
                                                        model_runner_output)
     scheduler.schedule()
 
-    outputs = engine_core_outputs.outputs
+    outputs = engine_core_outputs[0].outputs
     assert len(outputs) == 1
     output = outputs[0]
     assert output.finish_reason == FinishReason.STOP
@@ -260,10 +261,10 @@ def test_no_spurious_prefix_caching():
     assert len(scheduler.running) == 1
     assert len(scheduler.waiting) == 1
 
-    local_blocks = scheduler.kv_cache_manager.single_type_manager.req_to_blocks[
-        request_local.request_id]
-    remote_blocks = scheduler.kv_cache_manager.single_type_manager.req_to_blocks[  # noqa: E501
-        request_remote.request_id]
+    local_blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
+        0].req_to_blocks[request_local.request_id]
+    remote_blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
+        0].req_to_blocks[request_remote.request_id]
 
     # Local should have cached blocks (but not all due to preallocate).
     num_hashed_blocks = 0
@@ -299,8 +300,8 @@ def test_full_block_prompt():
     # STEP (1): Initialize a recv.
     scheduler_output = scheduler.schedule()
     # All blocks should be allocated.
-    num_blocks = len(scheduler.kv_cache_manager.single_type_manager.
-                     req_to_blocks[request_id])
+    num_blocks = len(scheduler.kv_cache_manager.coordinator.
+                     single_type_managers[0].req_to_blocks[request_id])
     assert num_blocks == NUM_EXTERNAL_FULL_BLOCKS
     model_runner_output = EMPTY_MODEL_RUNNER_OUTPUT
     scheduler.update_from_output(scheduler_output, model_runner_output)
@@ -318,8 +319,8 @@ def test_full_block_prompt():
 
     # We need to recompute the final token of the prompt to generate
     # the first new token, so we should not have a new block.
-    num_blocks = len(scheduler.kv_cache_manager.single_type_manager.
-                     req_to_blocks[request_id])
+    num_blocks = len(scheduler.kv_cache_manager.coordinator.
+                     single_type_managers[0].req_to_blocks[request_id])
     assert num_blocks == NUM_EXTERNAL_FULL_BLOCKS
     assert (scheduler_output.scheduled_new_reqs[0].num_computed_tokens ==
             NUM_TOKENS - 1)
@@ -335,7 +336,7 @@ def test_full_block_prompt():
                                                        model_runner_output)
     scheduler.schedule()
 
-    outputs = engine_core_outputs.outputs
+    outputs = engine_core_outputs[0].outputs
     assert len(outputs) == 1
     output = outputs[0]
     assert output.finish_reason == FinishReason.STOP
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 53e2d6fda1aeabe197bb900ec1f982c394d78388..4a9e3a7ad807a2f2b035a0b42e0a7c4b6021556b 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any, Optional
 
 import torch
@@ -31,11 +32,11 @@ def assert_scheduler_empty(scheduler: Scheduler):
     assert len(scheduler.encoder_cache_manager.cached) == 0
 
     # KVCache Manager.
-    assert len(
-        scheduler.kv_cache_manager.single_type_manager.req_to_blocks) == 0
+    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
+               req_to_blocks) == 0
     assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0
-    assert len(
-        scheduler.kv_cache_manager.single_type_manager.num_cached_block) == 0
+    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
+               num_cached_block) == 0
     num_free_blocks = (
         scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks)
     assert num_free_blocks == (
@@ -95,7 +96,7 @@ def create_scheduler(
     block_size = vllm_config.cache_config.block_size
     kv_cache_config = KVCacheConfig(
         num_blocks=num_blocks,  # A large number of blocks to hold all requests
-        tensors={},
+        kv_cache_tensors=[],
         kv_cache_groups=[
             KVCacheGroupSpec(['layer'],
                              FullAttentionSpec(block_size, 1, 1, torch.float32,
@@ -153,7 +154,6 @@ def create_request(
         multi_modal_placeholders=None,
         multi_modal_hashes=None,
         eos_token_id=EOS_TOKEN_ID,
-        arrival_time=0,
     )
     req.kv_transfer_params = kv_transfer_params
     return req
diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py
index 02475f7c150b85456f97eb859a674f32317fb108..0898ae65e7cd378b4bc56a0bb0b014e9fe7868d9 100644
--- a/tests/v1/metrics/test_ray_metrics.py
+++ b/tests/v1/metrics/test_ray_metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import ray
 
@@ -46,12 +47,15 @@ def test_engine_log_metrics_ray(
                 engine_args, stat_loggers=[RayPrometheusStatLogger])
 
             for i, prompt in enumerate(example_prompts):
-                engine.generate(
+                results = engine.generate(
                     request_id=f"request-id-{i}",
                     prompt=prompt,
                     sampling_params=SamplingParams(max_tokens=max_tokens),
                 )
 
+                async for _ in results:
+                    pass
+
     # Create the actor and call the async method
     actor = EngineTestActor.remote()  # type: ignore[attr-defined]
     ray.get(actor.run.remote())
diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py
index 3800cb392fbad14fe9a830357cb40c73e997728e..69180e6e5db494156318effcdd5b94a07138510d 100644
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from collections.abc import Generator
@@ -41,7 +42,7 @@ def vllm_model(vllm_runner, request) -> Generator[VllmRunner, None, None]:
             #TODO: enable this once we support it for
             # prompt logprobs.
             enable_prefix_caching=request.param,
-            gpu_memory_utilization=0.5,
+            gpu_memory_utilization=0.4,  # up to 2 alive concurrently
     ) as vllm_model:
         yield vllm_model
 
@@ -342,10 +343,13 @@ def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
 
-        runner = VllmRunner("facebook/opt-125m",
-                            max_logprobs=1,
-                            enable_prefix_caching=False,
-                            max_model_len=256)
+        runner = VllmRunner(
+            "facebook/opt-125m",
+            max_logprobs=1,
+            enable_prefix_caching=False,
+            # 2 other llms alive during whole session
+            gpu_memory_utilization=0.15,
+            max_model_len=256)
         vllm_sampling_params = SamplingParams(logprobs=1)
         # should pass
         runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
diff --git a/tests/v1/sample/test_logprobs_e2e.py b/tests/v1/sample/test_logprobs_e2e.py
index f62770060160e4336639a4eb31a3b7c231fdb558..085b2ee09743cdf01a64ccea201727390ee04927 100644
--- a/tests/v1/sample/test_logprobs_e2e.py
+++ b/tests/v1/sample/test_logprobs_e2e.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import lm_eval
 
diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py
index cbdb0b910d1dc2b76d0d085fc8c6385992b46a42..f35c3e194fa717f9bdc11de206402fdb7bf6a337 100644
--- a/tests/v1/sample/test_rejection_sampler.py
+++ b/tests/v1/sample/test_rejection_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any, Optional
 
 import pytest
diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py
index 24b759bc1fa60eabb5921da0ff8fd8c3663a6746..a2beb5ad71dbbe66a48991f71b413fe6e75f6874 100644
--- a/tests/v1/sample/test_sampler.py
+++ b/tests/v1/sample/test_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py
index 0512a1e026603860d816cf901f18ba3257ed447a..ac0f3eb58836f9947dee650074575abda638183c 100644
--- a/tests/v1/sample/test_sampling_params_e2e.py
+++ b/tests/v1/sample/test_sampling_params_e2e.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import pytest
diff --git a/tests/v1/sample/test_topk_topp_sampler.py b/tests/v1/sample/test_topk_topp_sampler.py
index 220f05c7ff1c3d0c77d594ca105e86a83fbcfd29..63fdeb5a6de8483ec07656953f935e9168a0644b 100644
--- a/tests/v1/sample/test_topk_topp_sampler.py
+++ b/tests/v1/sample/test_topk_topp_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 from flashinfer.sampling import top_k_renorm_probs, top_p_renorm_probs
diff --git a/tests/v1/sample/utils.py b/tests/v1/sample/utils.py
index 932b652aea32b8626d065110941ce863f3d93edb..8c111f846b47e15207cc4abfabe35c3dac21af7b 100644
--- a/tests/v1/sample/utils.py
+++ b/tests/v1/sample/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from enum import Enum
 from typing import Optional
diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py
index ed368fe828d07181f115f19db101d33f4b95356f..682d84dc23d12fda6e3f208b2b44444840787ec6 100644
--- a/tests/v1/shutdown/test_delete.py
+++ b/tests/v1/shutdown/test_delete.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test that we handle a startup Error and shutdown."""
 
 import pytest
diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py
index 9fedbe4f9a01a6396edb53510972f1b8ac17e681..523b7ee231151b05dd960c51514bd6b5f2c752f4 100644
--- a/tests/v1/shutdown/test_forward_error.py
+++ b/tests/v1/shutdown/test_forward_error.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test that we handle an Error in model forward and shutdown."""
 
 import asyncio
diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py
index 0fe48da475c6a49142690541b2549e43cb711f10..a077d48fecbba2bad4a1f1813beabf557e590199 100644
--- a/tests/v1/shutdown/test_processor_error.py
+++ b/tests/v1/shutdown/test_processor_error.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test error handling in Processor. Should not impact other reqs."""
 
 import asyncio
diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py
index 1bba19102ec611f3bbd8d4c392974e1e1d3a5bde..88fc5297aaf5054fb6a926568d5ef6cda59114eb 100644
--- a/tests/v1/shutdown/test_startup_error.py
+++ b/tests/v1/shutdown/test_startup_error.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test that we handle a startup Error and shutdown."""
 
 import pytest
diff --git a/tests/v1/shutdown/utils.py b/tests/v1/shutdown/utils.py
index 8f7c0380d407f12620fd01b066f05e945fce7d0b..124254a413377dbb676d41ad97d316ac1412c6fa 100644
--- a/tests/v1/shutdown/utils.py
+++ b/tests/v1/shutdown/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Shutdown test utils"""
 
 SHUTDOWN_TEST_TIMEOUT_SEC = 120
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index b49ac45f3129b220839abff61064485543a1369a..c93b7f57c0410379c0e34baa779931ce2a119362 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from unittest import mock
 
@@ -8,6 +9,7 @@ import torch
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
                          VllmConfig)
+from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.v1.spec_decode.eagle import EagleProposer
 
 model_dir = "meta-llama/Llama-3.1-8B-Instruct"
@@ -112,21 +114,26 @@ def test_prepare_inputs():
     assert torch.equal(token_indices, expected_token_indices)
 
 
-@pytest.mark.parametrize(
-    "method,proposer_helper,draft_model_dir,target_attribute_path", [
-        ("eagle", lambda k: _create_proposer("eagle", k), eagle_dir,
-         ('lm_head', )),
-        ("eagle3", lambda k: _create_proposer("eagle3", k), eagle3_dir,
-         ('model', 'embed_tokens')),
-    ])
+@pytest.mark.parametrize("method,proposer_helper", [
+    ("eagle", lambda k: _create_proposer("eagle", k)),
+    ("eagle3", lambda k: _create_proposer("eagle3", k)),
+])
+@pytest.mark.parametrize("pp_size", [1, 2])
+@pytest.mark.parametrize("use_distinct_embed_tokens", [True, False])
 @mock.patch('vllm.v1.spec_decode.eagle.get_pp_group')
 @mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config')
 @mock.patch('vllm.v1.spec_decode.eagle.get_model')
 def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
-                    proposer_helper, draft_model_dir, target_attribute_path):
-
-    # Setup model mock
+                    proposer_helper, pp_size, use_distinct_embed_tokens):
+    # Setup draft model mock
     mock_model = mock.MagicMock()
+    if use_distinct_embed_tokens:
+        # Some models can have a different hidden size than the target model,
+        # so we test that their embed_tokens doesn't get overwritten
+        mock_model.model.embed_tokens.weight.shape = (131072, 2048)
+    else:
+        mock_model.model.embed_tokens.weight.shape = (131072, 4096)
+
     mock_get_model.return_value = mock_model
 
     # Setup mocks for attention layers
@@ -144,22 +151,24 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
 
     # Setup mock for pp group to return the appropriate value for world size
     mock_pp_group = mock.MagicMock()
-    mock_pp_group.world_size = 2 if method == "eagle" else 1
+    mock_pp_group.world_size = pp_size
     mock_get_pp_group.return_value = mock_pp_group
 
-    # Setup target model with the appropriate attributes
-    target_model = mock.MagicMock()
+    # Setup the target model mock with a custom class so that
+    # isinstance() checks match the expected type.
+    class _TargetModelStub(LlamaForCausalLM):
+        model: mock.MagicMock
+        lm_head: mock.MagicMock
 
-    # Create the necessary attributes on the target model
-    current_obj = target_model
-    for i, attr in enumerate(target_attribute_path):
-        if i == len(target_attribute_path) - 1:
-            # Set the last attribute in the path to a MagicMock
-            setattr(current_obj, attr, mock.MagicMock())
-        else:
-            # Create intermediate objects if needed
-            setattr(current_obj, attr, mock.MagicMock())
-            current_obj = getattr(current_obj, attr)
+    target_model = mock.create_autospec(_TargetModelStub, instance=True)
+    target_model.model = mock.MagicMock()
+    target_model.model.embed_tokens.weight.shape = (131072, 4096)
+
+    from vllm.model_executor.models import SupportsMultiModal
+    assert not isinstance(target_model, SupportsMultiModal)
+
+    if method == "eagle":
+        target_model.lm_head = mock.MagicMock()
 
     # Create proposer using the helper function
     proposer = proposer_helper(k=8)
@@ -170,10 +179,18 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
     # Verify common interactions
     mock_get_model.assert_called_once()
 
-    # Verify the specific attribute sharing based on the method
+    # Verify that EAGLE models gain the lm head from the target model
     if method == "eagle":
         assert proposer.model.lm_head == target_model.lm_head
+
+    # Verify that the embed tokens are set correctly
+    # If pp_size is > 1, the embed tokens should be distinct
+    if pp_size > 1 or use_distinct_embed_tokens:
+        assert proposer.model.model.embed_tokens != \
+            target_model.model.embed_tokens
     else:
+        # When pp_size is 1 and the draft and target models have
+        # embed_tokens of the same shape, they should be shared.
         assert proposer.model.model.embed_tokens == \
             target_model.model.embed_tokens
 
diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py
index f577fb4ab3295991b7d52607ad66074b63f09574..9070d2b10f8b59c794aa8b53d7d51e56b53d99a7 100644
--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test whether spec decoding handles the max model length properly."""
 
 import pytest
diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py
index 50548219fff042b26f79a98df6e602314e69acef..ffea86d0d19ca92223377d3d2d5f00a8f8d7d1c8 100644
--- a/tests/v1/spec_decode/test_ngram.py
+++ b/tests/v1/spec_decode/test_ngram.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import numpy as np
 
diff --git a/tests/v1/structured_output/test_utils.py b/tests/v1/structured_output/test_utils.py
index ffc0bceeee49430dd0e67020ba303c23cd5a53c9..4e7c4b33e8c47b846c34307e1999ed10165a01f3 100644
--- a/tests/v1/structured_output/test_utils.py
+++ b/tests/v1/structured_output/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py
index ce4c4d198db580606f1d03e956eafbe87fdc20d3..075ceb257ab70793d5696b1e293562a59ca8b048 100644
--- a/tests/v1/test_async_llm_dp.py
+++ b/tests/v1/test_async_llm_dp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import os
@@ -28,12 +29,14 @@ if not current_platform.supports_v1(engine_args.create_model_config()):
                 allow_module_level=True)
 
 
-async def generate(engine: AsyncLLM,
-                   request_id: str,
-                   prompt: PromptType,
-                   output_kind: RequestOutputKind,
-                   max_tokens: int,
-                   prompt_logprobs: Optional[int] = None) -> tuple[int, str]:
+async def generate(
+        engine: AsyncLLM,
+        request_id: str,
+        prompt: PromptType,
+        output_kind: RequestOutputKind,
+        max_tokens: int,
+        prompt_logprobs: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None) -> tuple[int, str]:
     # Ensure generate doesn't complete too fast for cancellation test.
     await asyncio.sleep(0.2)
 
@@ -45,7 +48,8 @@ async def generate(engine: AsyncLLM,
                                      prompt_logprobs=prompt_logprobs)
     async for out in engine.generate(request_id=request_id,
                                      prompt=prompt,
-                                     sampling_params=sampling_params):
+                                     sampling_params=sampling_params,
+                                     data_parallel_rank=data_parallel_rank):
 
         num_tokens = len(out.outputs[0].token_ids)
         if output_kind == RequestOutputKind.DELTA:
@@ -59,14 +63,22 @@ async def generate(engine: AsyncLLM,
 
 
 @pytest.mark.parametrize(
-    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+    "output_kind",
+    [
+        RequestOutputKind.DELTA,
+        RequestOutputKind.FINAL_ONLY,
+    ],
+)
+@pytest.mark.parametrize("data_parallel_backend", ["mp", "ray"])
 @pytest.mark.asyncio
-async def test_load(output_kind: RequestOutputKind):
+async def test_load(output_kind: RequestOutputKind,
+                    data_parallel_backend: str):
 
     with ExitStack() as after:
 
         prompt = "This is a test of data parallel"
 
+        engine_args.data_parallel_backend = data_parallel_backend
         engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
 
@@ -80,9 +92,12 @@ async def test_load(output_kind: RequestOutputKind):
         for request_id in request_ids:
             tasks.append(
                 asyncio.create_task(
-                    generate(engine, request_id, prompt, output_kind,
-                             NUM_EXPECTED_TOKENS)))
-
+                    generate(engine,
+                             request_id,
+                             prompt,
+                             output_kind,
+                             NUM_EXPECTED_TOKENS,
+                             data_parallel_rank=0)))
         # Confirm that we got all the EXPECTED tokens from the requests.
         done, pending = await asyncio.wait(tasks,
                                            return_when=asyncio.FIRST_EXCEPTION)
diff --git a/tests/v1/test_metrics_reader.py b/tests/v1/test_metrics_reader.py
index 68539c80b59ccd0311ac7239b159f096a166d99a..c05de5e4cb645494c022508e7d55a87b3719358a 100644
--- a/tests/v1/test_metrics_reader.py
+++ b/tests/v1/test_metrics_reader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import prometheus_client
 import pytest
diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py
index 1b77417a1bd350898928efe6825529eefb9f8073..e5eadfd4e9dad2a07586f0bcdf55f07a63b2bce7 100644
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import pytest
diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py
index ee490071f6a27a4df064ff967c33d53ce7223222..0ab4e0bf59cf5f8f1f9cd3c5e807fb089da75491 100644
--- a/tests/v1/test_serial_utils.py
+++ b/tests/v1/test_serial_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections import UserDict
 from dataclasses import dataclass
 from typing import Optional
diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py
index b68f08385866b6dcd7f6c08842017d5473ce03e4..a3df882a9e29e5d24fd4ea42d18138635616a24e 100644
--- a/tests/v1/test_utils.py
+++ b/tests/v1/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py
index 1c0210b6a814b5d61624c181b078b7cea78fc0a6..7117a66c29584384c4c7b7e58ec3f2f8b3a7005f 100644
--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A basic correctness check for TPUs
 
 Run `pytest tests/v1/tpu/test_basic.py`.
diff --git a/tests/v1/tpu/test_mha_attn.py b/tests/v1/tpu/test_mha_attn.py
index 01664598ccfdeec54ada483bc3e58da5e8ea8974..55fee4ee1ad43b2f43a60412857bd41c18f33afd 100644
--- a/tests/v1/tpu/test_mha_attn.py
+++ b/tests/v1/tpu/test_mha_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Test:
 
diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py
index 8c87fc836b518084852993bdbba3aa071f278ec8..a61773a4f611bd1192d85a3be8eb6d227606df23 100644
--- a/tests/v1/tpu/test_multimodal.py
+++ b/tests/v1/tpu/test_multimodal.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai
 import pytest
diff --git a/tests/v1/tpu/test_pallas.py b/tests/v1/tpu/test_pallas.py
index 8faa5270b5930741d843dd377d62271193c93734..3a9d80847a16b82503c82aacce32cce0baae382f 100644
--- a/tests/v1/tpu/test_pallas.py
+++ b/tests/v1/tpu/test_pallas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from unittest.mock import ANY, patch
 
 import torch
diff --git a/tests/v1/tpu/test_perf.py b/tests/v1/tpu/test_perf.py
index 811833f73cdbcf26f2038791fac4ebf5602b1c07..f4a2d5ac853a8895c4700e543beb143f00c97cd9 100644
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A basic performance regression test for TPUs
 
 Run `pytest tests/v1/tpu/test_perf.py`.
diff --git a/tests/v1/tpu/test_sampler.py b/tests/v1/tpu/test_sampler.py
index 2bbeb3ddac91b551f8ee077c9c1948a4fdd7914a..198bb1e16ed9fd967806513a093246490fbf94a0 100644
--- a/tests/v1/tpu/test_sampler.py
+++ b/tests/v1/tpu/test_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
 
 import pytest
diff --git a/tests/v1/tpu/test_spmd_model_weight_loading.py b/tests/v1/tpu/test_spmd_model_weight_loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..916325e41b922e040b83995fe1ada36cab11fb85
--- /dev/null
+++ b/tests/v1/tpu/test_spmd_model_weight_loading.py
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: Apache-2.0
+import gc
+import tempfile
+
+import numpy as np
+import pytest
+import torch_xla.distributed.spmd as xs
+import torch_xla.runtime as xr
+
+from vllm.config import set_current_vllm_config
+from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
+                                             init_distributed_environment)
+from vllm.engine.arg_utils import EngineArgs
+from vllm.model_executor.model_loader.tpu import TPUModelLoader
+
+
+def _setup_environment(model):
+    engine_args = EngineArgs(model=model, )
+    vllm_config = engine_args.create_engine_config()
+    with set_current_vllm_config(vllm_config):
+        temp_file = tempfile.mkstemp()[1]
+        init_distributed_environment(
+            1,
+            0,
+            local_rank=0,
+            distributed_init_method=f"file://{temp_file}",
+            backend="gloo")
+        # Under single worker mode, full model is init first and then
+        # partitioned using GSPMD.
+        ensure_model_parallel_initialized(1, 1)
+    return vllm_config
+
+
+MESH = None
+
+
+def _get_spmd_mesh():
+    global MESH
+    if MESH is None:
+        xr.use_spmd()
+        num_devices = xr.global_runtime_device_count()
+        mesh_shape = (num_devices, 1)
+        device_ids = np.array(range(num_devices))
+        MESH = xs.Mesh(device_ids, mesh_shape, ('x', 'y'))
+    return MESH
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "Qwen/Qwen2-1.5B-Instruct",
+        # Skip large models due to CI runner disk space limitations
+        # "meta-llama/Llama-3.1-8B-Instruct",
+        # "meta-llama/Llama-3.1-70B-Instruct",
+    ])
+def test_tpu_model_loader(model):
+    # Skip the 70B test if there are less than 8 chips
+    # TODO: Query using torch xla API, the query API is not working
+    # with SPMD now. However, This test is running under SPMD mode.
+    if '70B' in model and xr.global_runtime_device_count() < 8:
+        pytest.skip(
+            "Skipping 70B model if the TPU VM has less than 8 chips to \
+                     avoid OOM.")
+
+    vllm_config = _setup_environment(model)
+    loader = TPUModelLoader(load_config=vllm_config.load_config)
+    mesh = _get_spmd_mesh()
+    model = loader.load_model(vllm_config, vllm_config.model_config, mesh)
+    del model
+    gc.collect()
diff --git a/tests/v1/tpu/test_topk_topp_sampler.py b/tests/v1/tpu/test_topk_topp_sampler.py
index ff9217f8f3cab55125685b4ccbb1f39643408d1e..ca5c067b364e0040f1f317a1a3b819af90295917 100644
--- a/tests/v1/tpu/test_topk_topp_sampler.py
+++ b/tests/v1/tpu/test_topk_topp_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 
 import pytest
diff --git a/tests/v1/tpu/test_tpu_qkv_linear.py b/tests/v1/tpu/test_tpu_qkv_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..b98570f01a7f2643bc1dfbdf599a4ab5738c5c7c
--- /dev/null
+++ b/tests/v1/tpu/test_tpu_qkv_linear.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: Apache-2.0
+import tempfile
+
+import numpy as np
+import pytest
+import torch
+import torch_xla.distributed.spmd as xs
+import torch_xla.runtime as xr
+
+from vllm.config import set_current_vllm_config
+from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
+                                             init_distributed_environment)
+from vllm.distributed.tpu_distributed_utils import XlaQKVParallelLinear
+from vllm.engine.arg_utils import EngineArgs
+from vllm.model_executor.layers.linear import QKVParallelLinear
+
+
+@pytest.fixture(autouse=True)
+def setup_environment():
+    # This is a fake config used for init dist env.
+    # QKVParallelLinear needs dist env to be initialized.
+    engine_args = EngineArgs(
+        model="Qwen/Qwen2-1.5B-Instruct",
+        max_model_len=64,
+        max_num_batched_tokens=64,
+        max_num_seqs=4,
+    )
+
+    vllm_config = engine_args.create_engine_config()
+
+    with set_current_vllm_config(vllm_config):
+        temp_file = tempfile.mkstemp()[1]
+        init_distributed_environment(
+            1,
+            0,
+            local_rank=0,
+            distributed_init_method=f"file://{temp_file}",
+            backend="gloo")
+        ensure_model_parallel_initialized(1, 1)
+        yield
+
+
+MESH = None
+
+
+def _get_spmd_mesh():
+    global MESH
+    if MESH is None:
+        xr.use_spmd()
+        num_devices = xr.global_runtime_device_count()
+        mesh_shape = (num_devices, 1)
+        device_ids = np.array(range(num_devices))
+        MESH = xs.Mesh(device_ids, mesh_shape, ('x', 'y'))
+    return MESH
+
+
+@pytest.mark.parametrize("bias", [False, True])
+# `xr.use_spmd()` will set a global state, and this state is not reversible.
+# Therefore, non-SPMD tests should be run before SPMD tests.
+@pytest.mark.parametrize("mesh", [None, _get_spmd_mesh()])
+@pytest.mark.parametrize("device", ['cpu', 'xla'])
+@torch.no_grad()
+def test_xla_qkv_linear(bias, mesh, device):
+    torch.manual_seed(123)
+
+    qkv_linear = QKVParallelLinear(
+        hidden_size=4096,
+        head_size=128,
+        total_num_heads=32,
+        total_num_kv_heads=8,
+        bias=bias,
+        params_dtype=torch.bfloat16,
+        return_bias=False,
+    )
+
+    qkv_linear.weight.data = torch.rand_like(qkv_linear.weight.data) / 10
+    if bias:
+        qkv_linear.bias.data = torch.rand_like(qkv_linear.bias.data)
+
+    xla_qkv_linear = XlaQKVParallelLinear(qkv_linear, mesh=mesh)
+
+    qkv_linear = qkv_linear.to(device)
+    xla_qkv_linear = xla_qkv_linear.to(device)
+    input_tensor = torch.rand(10, 4096, dtype=torch.bfloat16) / 10
+    input_tensor = input_tensor.to(device)
+
+    output = qkv_linear(input_tensor)
+    xla_output = xla_qkv_linear(input_tensor)
+    assert torch.allclose(output.cpu(), xla_output.cpu())
diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index 319b38b4ca09d43671739e90cbbb2166b0cd8b2e..0e7d305fef9ede79fb57b52b2c840af8adc530b4 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -1,34 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0
-import unittest.mock as mock
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 
-from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
+from vllm.attention.layer import Attention
+from vllm.config import (CacheConfig, ModelConfig, SchedulerConfig, VllmConfig,
+                         set_current_vllm_config)
 from vllm.sampling_params import SamplingParams
+from vllm.utils import GiB_bytes
+from vllm.v1.core.kv_cache_utils import (estimate_max_model_len,
+                                         get_kv_cache_config)
 from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                        SchedulerOutput)
 from vllm.v1.worker.tpu_model_runner import (
     TPUModelRunner, _get_padded_num_reqs_with_upper_limit,
     _get_padded_token_len, _get_req_paddings, _get_token_paddings)
 
-# Mock torch_xla module since it may not be available in the test environments
-torch_xla_patcher = mock.patch.dict(
-    "sys.modules", {
-        "torch_xla": mock.MagicMock(),
-        "torch_xla.core.xla_model": mock.MagicMock(),
-        "torch_xla.runtime": mock.MagicMock(),
-    })
-torch_xla_patcher.start()
 
-# Mock the PallasAttentionBackend
-pallas_attention_backend_patcher = mock.patch(
-    "vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", )
-pallas_attention_backend_patcher.start()
-
-
-@pytest.fixture
-def model_runner():
-    # Patchers have already been started at module level.
+def get_vllm_config():
     scheduler_config = SchedulerConfig(
         max_num_seqs=10,
         max_num_batched_tokens=512,
@@ -54,18 +43,19 @@ def model_runner():
         cache_config=cache_config,
         scheduler_config=scheduler_config,
     )
+    return vllm_config
+
+
+def get_model_runner(vllm_config):
     device = "xla:0"  # Mocking TPU device
-    with mock.patch("vllm.v1.worker.tpu_model_runner.torch"), \
-         mock.patch("vllm.v1.worker.tpu_model_runner.xm"), \
-         mock.patch("vllm.v1.worker.tpu_model_runner.xr"):
-        return TPUModelRunner(vllm_config, device)
+    return TPUModelRunner(vllm_config, device)
 
 
-@pytest.fixture(autouse=True, scope="session")
-def cleanup_patches():
-    yield
-    torch_xla_patcher.stop()
-    pallas_attention_backend_patcher.stop()
+@pytest.fixture
+def model_runner():
+    # Patchers have already been started at module level.
+    vllm_config = get_vllm_config()
+    return get_model_runner(vllm_config)
 
 
 def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
@@ -81,7 +71,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
                 mm_hashes=[],
                 mm_positions=[],
                 sampling_params=SamplingParams(),
-                block_ids=[0],
+                block_ids=([0], ),  # block_ids should be tuple[list[int]]
                 num_computed_tokens=0,
                 lora_request=None,
             ))
@@ -112,14 +102,35 @@ def _is_req_added(model_runner, req_id: str) -> bool:
 
 
 def _is_req_state_block_table_match(model_runner, req_id: str) -> bool:
+    """Check if the request state block IDs match the block table.
+    
+    This function handles both legacy BlockTable and new MultiGroupBlockTable
+    structures for backward compatibility.
+    """
+
     req_index = model_runner.input_batch.req_id_to_index[req_id]
-    block_table = model_runner.input_batch.block_table
+    multi_group_block_table = model_runner.input_batch.block_table
     req_state = model_runner.requests[req_id]
-    if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids):
+
+    # Access the first block table from MultiGroupBlockTable
+    # This is safe since we currently only use single KV cache groups
+    block_table = multi_group_block_table[0]
+
+    # req_state.block_ids is now tuple[list[int], ...] for MultiGroupBlockTable
+    # Extract the first group's block IDs
+    if isinstance(req_state.block_ids[0], list):
+        # New format: tuple[list[int], ...] - extract first group
+        req_block_ids = req_state.block_ids[0]
+    else:
+        # Legacy format: list[int] - use directly
+        req_block_ids = req_state.block_ids
+
+    if block_table.num_blocks_per_row[req_index] != len(req_block_ids):
         return False
+
     num_blocks = block_table.num_blocks_per_row[req_index]
-    return (block_table.block_table_np[req_index, :num_blocks] ==
-            req_state.block_ids).all()
+    block_table_values = block_table.block_table_np[req_index, :num_blocks]
+    return (block_table_values == req_block_ids).all()
 
 
 def test_update_states_new_request(model_runner):
@@ -199,7 +210,7 @@ def test_update_states_request_resumed(model_runner):
         req_id=req_id,
         resumed_from_preemption=False,
         new_token_ids=[],
-        new_block_ids=[],
+        new_block_ids=([], ),
         num_computed_tokens=0,
     )
 
@@ -341,3 +352,236 @@ def test_get_req_paddings():
     assert _get_req_paddings(1, 32) == [8, 16, 32]
     assert _get_req_paddings(8, 32) == [8, 16, 32]
     assert _get_req_paddings(8, 36) == [8, 16, 32, 36]
+
+
+def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(
+        model_runner):
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    error_msg = f"{layer_1} must come before the current layer"
+    vllm_config = model_runner.vllm_config
+    with pytest.raises(ValueError, match=error_msg), \
+        set_current_vllm_config(vllm_config):
+        fwd_context = {
+            # initialization below will fail because target layer is invalid;
+            # the target layer needs to come before layer 1
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=128,
+                scale=1.0,
+                prefix=layer_0,
+                kv_sharing_target_layer_name=layer_1,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=128,
+                scale=1.0,
+                prefix=layer_1,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+
+
+def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner):
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    invalid_layer = "model.layers.0.cross_attn.attn"
+    error_msg = f"{invalid_layer} is not a valid Attention layer in the model"
+    vllm_config = model_runner.vllm_config
+    with pytest.raises(ValueError, match=error_msg), \
+        set_current_vllm_config(vllm_config):
+        fwd_context = {
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=128,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=128,
+                scale=1.0,
+                prefix=layer_1,
+                # invalid layer: cross_attn.atn doesn't exist!
+                kv_sharing_target_layer_name=invalid_layer,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+
+
+def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner):
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    error_msg = f"{layer_1} cannot be the same as the current layer"
+    vllm_config = model_runner.vllm_config
+    with pytest.raises(ValueError, match=error_msg), \
+        set_current_vllm_config(vllm_config):
+        fwd_context = {
+            # initialization below will fail because target layer is invalid;
+            # the target layer needs to come before layer 1
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=128,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=128,
+                scale=1.0,
+                prefix=layer_1,
+                kv_sharing_target_layer_name=layer_1,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+
+
+def test_init_kv_cache_without_kv_sharing():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    vllm_config = get_vllm_config()
+    with set_current_vllm_config(vllm_config):
+        fwd_context = {
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=128,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=128,
+                scale=1.0,
+                prefix=layer_1,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+    # Set high context length to test max context length estimation
+    vllm_config.model_config.max_model_len = 1_000_000
+    vllm_ctx = vllm_config.compilation_config.static_forward_context
+    model_runner = get_model_runner(vllm_config)
+    kv_cache_spec = model_runner.get_kv_cache_spec()
+    assert len(kv_cache_spec) == 2
+    assert len(model_runner.shared_kv_cache_layers) == 0
+
+    available_memory = 20 * GiB_bytes
+    # page size for each layer KV can be calculated as
+    # 2 (non-MLA) * 8 (num_heads) * 128 (head_dim)
+    # * 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB
+    num_expected_blocks = 20480  # 20GB / 512KB / 2 (num layers)
+    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
+                                          available_memory)
+    assert kv_cache_config.num_blocks == num_expected_blocks
+    assert len(kv_cache_config.kv_cache_tensors) == 2
+    assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
+    assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2
+
+    max_context_len =\
+        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
+    # max context len with KV sharing should be 2x as large as without
+    # max_context_len = available_memory / (page_size / block_size) / num_caches
+    # max_context_len = 5GB / (512KB / 128) / 2 = 655360
+    assert max_context_len == 655360
+
+    # important: override tensor size to prevent large mem alloc during test
+    # this will only allocate 2 block worth of memory (2 * 512kb)
+    kv_cache_config.num_blocks = 1
+    for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
+        kv_cache_tensor.size = (
+            kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes)
+
+    model_runner.initialize_kv_cache(kv_cache_config)
+
+    layer_0_kv = vllm_ctx[layer_0].kv_cache[0]
+    layer_1_kv = vllm_ctx[layer_1].kv_cache[0]
+    # check layer 1 kv cache does NOT share memory with layer 0
+    assert id(layer_1_kv) != id(layer_0_kv)
+
+    # check layer 1 added to kv cache group's layer names
+    assert len(kv_cache_config.kv_cache_groups) == 1
+    assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
+    assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
+    assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
+
+
+def test_init_kv_cache_with_kv_sharing_valid():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    vllm_config = get_vllm_config()
+    with set_current_vllm_config(vllm_config):
+        fwd_context = {
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=128,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=128,
+                scale=1.0,
+                prefix=layer_1,
+                kv_sharing_target_layer_name="model.layers.0.self_attn.attn",
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+    # Set high context length to test max context length estimation
+    vllm_config.model_config.max_model_len = 3_000_000
+    vllm_ctx = vllm_config.compilation_config.static_forward_context
+    model_runner = get_model_runner(vllm_config)
+    kv_cache_spec = model_runner.get_kv_cache_spec()
+    assert len(kv_cache_spec) == 1
+    assert layer_0 in kv_cache_spec
+    assert model_runner.shared_kv_cache_layers[layer_1] == layer_0
+
+    available_memory = 20 * GiB_bytes
+    # page size for layer 0's kv_cache_spec is 512KB
+    # with KV sharing, we can allocate (available_mem//page_size//1) blocks
+    # which is twice as many as without KV sharing
+    num_expected_blocks = 2 * 20480  # 20GB / 512KB
+    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
+                                          available_memory)
+    assert kv_cache_config.num_blocks == num_expected_blocks
+    assert len(kv_cache_config.kv_cache_tensors) == 1
+    # Each layer now has twice the available memory for KV cache
+    # compared to no KV sharing
+    assert kv_cache_config.kv_cache_tensors[0].size == available_memory
+
+    max_context_len =\
+        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
+    # max context len with KV sharing should be 2x as large as without
+    assert max_context_len == (2 * 655360)
+
+    # important: override tensor size to prevent large mem alloc during test
+    # this will only allocate 1 block worth of memory (512kb)
+    kv_cache_config.num_blocks = 1
+    kv_cache_config.kv_cache_tensors[0].size =\
+        kv_cache_spec[layer_0].page_size_bytes
+
+    model_runner.initialize_kv_cache(kv_cache_config)
+
+    layer_0_kv = vllm_ctx[layer_0].kv_cache[0]
+    layer_1_kv = vllm_ctx[layer_1].kv_cache[0]
+    # check layer 1 kv cache shares memory with layer 0
+    assert id(layer_1_kv) == id(layer_0_kv)
+
+    # check layer 1 added to kv cache group's layer names
+    assert len(kv_cache_config.kv_cache_groups) == 1
+    assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
+    assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
+    assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
index 27741bd156be140b2a930ce85e0a9a02dab03e02..de6ebe4f6716bcef8ef360d7c284d3e35e61a94c 100644
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import inspect
 from typing import Optional
@@ -9,8 +10,6 @@ import torch
 
 from vllm.sampling_params import SamplingParams
 from vllm.utils import is_pin_memory_available, make_tensor_with_pad
-from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
-                                        KVCacheGroupSpec, KVCacheTensor)
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@@ -24,27 +23,6 @@ CUDA_DEVICES = [
 MAX_NUM_PROMPT_TOKENS = 64
 
 
-def get_kv_cache_config() -> KVCacheConfig:
-    return KVCacheConfig(
-        num_blocks=10,
-        tensors={
-            "layer.0": KVCacheTensor(size=1024),
-        },
-        kv_cache_groups=[
-            KVCacheGroupSpec(
-                layer_names=["layer.0"],
-                kv_cache_spec=FullAttentionSpec(
-                    block_size=1,
-                    num_kv_heads=1,
-                    head_size=16,
-                    dtype=torch.float16,
-                    use_mla=False,
-                ),
-            ),
-        ],
-    )
-
-
 def _compare_objs(obj1, obj2):
     attrs = inspect.getmembers(obj1, lambda a: not (inspect.isroutine(a)))
     attr_names = set([
@@ -225,7 +203,7 @@ def _construct_cached_request_state(req_id_suffix: int):
         sampling_params=_create_sampling_params(),
         mm_inputs=[],
         mm_positions=[],
-        block_ids=[[]],
+        block_ids=([], ),
         generator=None,
         num_computed_tokens=len(output_token_ids),
         output_token_ids=output_token_ids,
@@ -251,7 +229,7 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
         device=torch.device(device),
         pin_memory=is_pin_memory_available(),
         vocab_size=1024,
-        block_size=1,
+        block_sizes=[1],
     )
     reqs: list[CachedRequestState] = []
     req_id_reqs = {}
@@ -341,7 +319,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int,
         device=torch.device(device),
         pin_memory=is_pin_memory_available(),
         vocab_size=1024,
-        block_size=1,
+        block_sizes=[1],
     )
     ref_input_batch: InputBatch = InputBatch(
         max_num_reqs=batch_size,
@@ -350,7 +328,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int,
         device=torch.device(device),
         pin_memory=is_pin_memory_available(),
         vocab_size=1024,
-        block_size=1,
+        block_sizes=[1],
     )
 
     reqs: list[CachedRequestState] = []
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index b8c3d88617d0df44143bf8e59dc921e65b8e98e8..994432dfd593bcaf3a50c7a963f28faf1cc20f2c 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -1,10 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import random
 
 import pytest
 
+from vllm.attention import Attention
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, VllmConfig)
+                         SchedulerConfig, VllmConfig, set_current_vllm_config)
 from vllm.sampling_params import SamplingParams
+from vllm.utils import GiB_bytes
+from vllm.v1.core.kv_cache_utils import (estimate_max_model_len,
+                                         get_kv_cache_config)
 from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                        SchedulerOutput)
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
@@ -13,28 +20,33 @@ from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
+BLOCK_SIZE = 16
+NUM_BLOCKS = 10
+DEVICE = "cuda"
+
 
 def initialize_kv_cache(runner: GPUModelRunner):
     """
     Only perform necessary steps in GPUModelRunner.initialize_kv_cache()
     """
+    attn_spec = FullAttentionSpec(
+        block_size=BLOCK_SIZE,
+        num_kv_heads=runner.model_config.get_num_kv_heads(
+            runner.parallel_config),
+        head_size=runner.model_config.get_head_size(),
+        dtype=runner.kv_cache_dtype,
+        use_mla=False,
+    )
+    tensor_size = attn_spec.page_size_bytes * NUM_BLOCKS
     kv_cache_config = KVCacheConfig(
-        num_blocks=10,
-        tensors={
-            "layer.0": KVCacheTensor(size=1024),
-        },
+        num_blocks=NUM_BLOCKS,
+        kv_cache_tensors=[
+            KVCacheTensor(size=tensor_size, shared_by=["layer.0"]),
+        ],
         kv_cache_groups=[
-            KVCacheGroupSpec(
-                layer_names=["layer.0"],
-                kv_cache_spec=FullAttentionSpec(
-                    block_size=16,
-                    num_kv_heads=runner.model_config.get_num_kv_heads(
-                        runner.parallel_config),
-                    head_size=runner.model_config.get_head_size(),
-                    dtype=runner.kv_cache_dtype,
-                    use_mla=False,
-                ))
-        ])
+            KVCacheGroupSpec(layer_names=["layer.0"], kv_cache_spec=attn_spec)
+        ],
+    )
     runner.kv_cache_config = kv_cache_config
     runner.input_batch = InputBatch(
         max_num_reqs=runner.max_num_reqs,
@@ -43,13 +55,14 @@ def initialize_kv_cache(runner: GPUModelRunner):
         device=runner.device,
         pin_memory=runner.pin_memory,
         vocab_size=runner.model_config.get_vocab_size(),
-        block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size,
+        block_sizes=[
+            kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size
+        ],
     )
     runner.initialize_attn_backend(kv_cache_config)
 
 
-@pytest.fixture
-def model_runner():
+def get_vllm_config():
     scheduler_config = SchedulerConfig(
         max_num_seqs=10,
         max_num_batched_tokens=512,
@@ -65,7 +78,7 @@ def model_runner():
         seed=42,
     )
     cache_config = CacheConfig(
-        block_size=16,
+        block_size=BLOCK_SIZE,
         gpu_memory_utilization=0.9,
         swap_space=0,
         cache_dtype="auto",
@@ -77,13 +90,25 @@ def model_runner():
         scheduler_config=scheduler_config,
         parallel_config=parallel_config,
     )
+    return vllm_config
+
 
-    device = "cuda"
-    runner = GPUModelRunner(vllm_config, device)
+@pytest.fixture
+def model_runner():
+    vllm_config = get_vllm_config()
+    model_config = vllm_config.model_config
+    num_heads = model_config.get_num_kv_heads(vllm_config.parallel_config)
+    head_size = model_config.get_head_size()
+    vllm_config.compilation_config.static_forward_context[
+        "layer.0"] = Attention(num_heads, head_size, 0.1)
+    runner = GPUModelRunner(vllm_config, DEVICE)
     initialize_kv_cache(runner)
     return runner
 
 
+model_runner_2 = model_runner
+
+
 def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
     new_reqs = []
     num_scheduled_tokens = {}
@@ -97,7 +122,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
                 mm_hashes=[],
                 mm_positions=[],
                 sampling_params=SamplingParams(),
-                block_ids=[[0]],
+                block_ids=([0], ),
                 num_computed_tokens=0,
                 lora_request=None,
             ))
@@ -225,7 +250,7 @@ def test_update_states_request_resumed(model_runner):
         req_id=req_id,
         resumed_from_preemption=False,
         new_token_ids=[],
-        new_block_ids=[[]],
+        new_block_ids=([], ),
         num_computed_tokens=0,
     )
 
@@ -321,3 +346,275 @@ def test_update_states_request_unscheduled(model_runner):
 
     assert _is_req_added(model_runner, req_ids[1])
     assert not _is_req_scheduled(model_runner, req_ids[1])
+
+
+def test_kv_cache_stride_order(monkeypatch, model_runner):
+    # This test checks if GPUModelRunner initializes correctly when an attention
+    # backend enforces a non-default KV cache stride order.
+    n_heads = model_runner.model_config.get_num_kv_heads(
+        model_runner.parallel_config)
+    expected_kv_cache_shape = [
+        2, NUM_BLOCKS, BLOCK_SIZE, n_heads,
+        model_runner.model_config.get_head_size()
+    ]
+    # TODO mla test
+    default_stride = list(range(5))
+    # Permutation that gets you back to expected kv shape
+    rnd_stride = tuple(random.sample(default_stride, len(default_stride)))
+
+    def rnd_stride_order():
+        return rnd_stride
+
+    # Patch the attention backend class and re-trigger the KV cache creation.
+    for attn_backend in model_runner.attn_backends:
+        monkeypatch.setattr(attn_backend, "get_kv_cache_stride_order",
+                            rnd_stride_order)
+
+    model_runner.attn_backends = []
+    model_runner.attn_metadata_builders = []
+    model_runner.initialize_kv_cache(model_runner.kv_cache_config)
+
+    # Shape is unchanged, but layout may differ
+    kv_cache_shape = model_runner.kv_caches[0].shape
+    assert list(kv_cache_shape) == expected_kv_cache_shape
+    if default_stride == rnd_stride:
+        assert all(kv.is_contiguous() for kv in model_runner.kv_caches)
+    else:
+        assert all(not kv.is_contiguous() for kv in model_runner.kv_caches)
+
+
+def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
+    # In this test, model_runner loads model + weights in one go, while
+    # model_runner_2 loads dummy weights first then load real weights inplace
+    model_runner.load_model()
+    original_load_format = model_runner_2.load_config.load_format
+    model_runner_2.load_config.load_format = "dummy"
+    model_runner_2.load_model()  # Initial model loading with dummy weights
+    assert str(model_runner.get_model().state_dict()) != str(
+        model_runner_2.get_model().state_dict())
+    model_runner_2.load_config.load_format = original_load_format
+    model_runner_2.load_model()  # Load real weights inplace
+    assert str(model_runner.get_model().state_dict()) == str(
+        model_runner_2.get_model().state_dict())
+
+
+def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    error_msg = f"{layer_1} must come before the current layer"
+    with pytest.raises(ValueError, match=error_msg):
+        fwd_context = {
+            # initialization below will fail because target layer is invalid;
+            # the target layer needs to come before layer 1
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+                kv_sharing_target_layer_name=layer_1,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+
+
+def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    invalid_layer = "model.layers.0.cross_attn.attn"
+    error_msg = f"{invalid_layer} is not a valid Attention layer in the model"
+    with pytest.raises(ValueError, match=error_msg):
+        fwd_context = {
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+                # invalid layer: cross_attn.atn doesn't exist!
+                kv_sharing_target_layer_name=invalid_layer,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+
+
+def test_init_kv_cache_with_kv_sharing_target_same_as_current():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    error_msg = f"{layer_1} cannot be the same as the current layer"
+    with pytest.raises(ValueError, match=error_msg):
+        fwd_context = {
+            # initialization below will fail because target layer is invalid;
+            # the target layer needs to come before layer 1
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+                kv_sharing_target_layer_name=layer_1,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+
+
+def test_init_kv_cache_without_kv_sharing():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    vllm_config = get_vllm_config()
+    with set_current_vllm_config(vllm_config):
+        fwd_context = {
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+    # Set high context length to test max context length estimation
+    vllm_config.model_config.max_model_len = 3_000_000
+    vllm_ctx = vllm_config.compilation_config.static_forward_context
+    runner = GPUModelRunner(vllm_config, DEVICE)
+    kv_cache_spec = runner.get_kv_cache_spec()
+    assert len(kv_cache_spec) == 2
+    assert len(runner.shared_kv_cache_layers) == 0
+
+    available_memory = 20 * GiB_bytes
+    # page size for layer 0's kv_cache_spec is 32KB
+    num_expected_blocks = 327680  # 20GB / 32KB / 2 (num layers)
+    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
+                                          available_memory)
+    assert kv_cache_config.num_blocks == num_expected_blocks
+    assert len(kv_cache_config.kv_cache_tensors) == 2
+    assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
+    assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2
+
+    max_context_len =\
+        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
+    # max context len with KV sharing should be 2x as large as without
+    assert max_context_len == 1310720
+
+    # important: override tensor size to prevent large mem alloc during test
+    # this will only allocate 2 block worth of memory (2 * 32kb)
+    kv_cache_config.num_blocks = 1
+    for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
+        kv_cache_tensor.size = (
+            kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes)
+
+    runner.initialize_kv_cache(kv_cache_config)
+
+    layer_0_kv = vllm_ctx[layer_0].kv_cache[0]
+    layer_1_kv = vllm_ctx[layer_1].kv_cache[0]
+    # check layer 1 kv cache does NOT share memory with layer 0
+    assert id(layer_1_kv) != id(layer_0_kv)
+
+    # check layer 1 added to kv cache group's layer names
+    assert len(kv_cache_config.kv_cache_groups) == 1
+    assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
+    assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
+    assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
+
+
+def test_init_kv_cache_with_kv_sharing_valid():
+    layer_0 = "model.layers.0.self_attn.attn"
+    layer_1 = "model.layers.1.self_attn.attn"
+    vllm_config = get_vllm_config()
+    with set_current_vllm_config(vllm_config):
+        fwd_context = {
+            layer_0:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_0,
+            ),
+            layer_1:
+            Attention(
+                num_heads=8,
+                head_size=64,
+                scale=1.0,
+                prefix=layer_1,
+                kv_sharing_target_layer_name="model.layers.0.self_attn.attn",
+            )
+        }
+        # suppress var not used error
+        assert fwd_context is not None
+    # Set high context length to test max context length estimation
+    vllm_config.model_config.max_model_len = 3_000_000
+    vllm_ctx = vllm_config.compilation_config.static_forward_context
+    runner = GPUModelRunner(vllm_config, DEVICE)
+    kv_cache_spec = runner.get_kv_cache_spec()
+    assert len(kv_cache_spec) == 1
+    assert layer_0 in kv_cache_spec
+    assert runner.shared_kv_cache_layers[layer_1] == layer_0
+
+    available_memory = 20 * GiB_bytes
+    # page size for layer 0's kv_cache_spec is 32KB
+    # with KV sharing, we can allocate (available_mem//page_size//1) blocks
+    # which is twice as many as without KV sharing
+    num_expected_blocks = 655360  # 20GB / 32KB
+    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
+                                          available_memory)
+    assert kv_cache_config.num_blocks == num_expected_blocks
+    assert len(kv_cache_config.kv_cache_tensors) == 1
+    # Each layer now has twice the available memory for KV cache
+    # compared to no KV sharing
+    assert kv_cache_config.kv_cache_tensors[0].size == available_memory
+
+    max_context_len =\
+        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
+    # max context len with KV sharing should be 2x as large as without
+    assert max_context_len == 2 * 1310720
+
+    # important: override tensor size to prevent large mem alloc during test
+    # this will only allocate 1 block worth of memory (32kb)
+    kv_cache_config.num_blocks = 1
+    kv_cache_config.kv_cache_tensors[0].size =\
+        kv_cache_spec[layer_0].page_size_bytes
+
+    runner.initialize_kv_cache(kv_cache_config)
+
+    layer_0_kv = vllm_ctx[layer_0].kv_cache[0]
+    layer_1_kv = vllm_ctx[layer_1].kv_cache[0]
+    # check layer 1 kv cache shares memory with layer 0
+    assert id(layer_1_kv) == id(layer_0_kv)
+
+    # check layer 1 added to kv cache group's layer names
+    assert len(kv_cache_config.kv_cache_groups) == 1
+    assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
+    assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
+    assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
diff --git a/tests/vllm_test_utils/setup.py b/tests/vllm_test_utils/setup.py
index c039431494c4edd0648f6f13feb3d2fef97a4540..83be8bdce85cf9b05e4f84187520b70917044993 100644
--- a/tests/vllm_test_utils/setup.py
+++ b/tests/vllm_test_utils/setup.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from setuptools import setup
 
diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py
index 1d1219fbeffa154aff3f8112efe0192b3e504f36..2818428de4a733ae11835e49f66c6f8538036364 100644
--- a/tests/vllm_test_utils/vllm_test_utils/__init__.py
+++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 vllm_utils is a package for vLLM testing utilities.
 It does not import any vLLM modules.
diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py
index 3b25980cb94637e22c21ff7f01ccdfa648b01b36..49fd083ef19c88c17c7b30dcc4ace15b5140dba1 100644
--- a/tests/vllm_test_utils/vllm_test_utils/blame.py
+++ b/tests/vllm_test_utils/vllm_test_utils/blame.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
 import dataclasses
diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py
index 27077f13de24f8db6e89b420da8f0ee1227d879a..9454221b273e6296cf4b289ca7c86d0b81aca359 100644
--- a/tests/vllm_test_utils/vllm_test_utils/monitor.py
+++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
 import dataclasses
diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py
index 9f99b3725fe41b2a9458427f6d19d872ed307087..3aabae099073ee2065bd95ed20c8ae4e9351ac1d 100644
--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 
diff --git a/tests/worker/conftest.py b/tests/worker/conftest.py
index 372d71a78d0a7ffcbc796fa3e91dd5f9f18707c5..3f202d4dbe948161755bedbb6039843458aca5c0 100644
--- a/tests/worker/conftest.py
+++ b/tests/worker/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
 
diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py
index 3e237aacc8c60d42558859da2b83e67c90dcf286..35ac90b38e8403db7945f7fd51ffef1c52369c36 100644
--- a/tests/worker/test_encoder_decoder_model_runner.py
+++ b/tests/worker/test_encoder_decoder_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 
diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py
index a41fc52170fee6989728371641956d39b68fb543..a5e61128d1e938eb5fea11dc6a8705b5dd71e55c 100644
--- a/tests/worker/test_model_input.py
+++ b/tests/worker/test_model_input.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index ae4b536524be0823564da87fee73aa3fdfc82ee8..0be25aa2fc35dd0c7d4cae53881adf203cbf3472 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
 import torch
diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py
index 22466105b8abaec18585eab800341e024ebf3d86..d8767f700b576fbfc8fdfc3362c415c08ace5935 100644
--- a/tests/worker/test_profile.py
+++ b/tests/worker/test_profile.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py
index 3ab8070999b009b406e2574da44a96265d6d520d..6d9f404ac207bb096a0628ecaf4bcf827f544219 100644
--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/tools/check_spdx_header.py b/tools/check_spdx_header.py
index 709befc53207c26b5cb948f98aa97a2d0f68fc8d..92914186b16e02050748e28c3467d0f2e4597a48 100644
--- a/tools/check_spdx_header.py
+++ b/tools/check_spdx_header.py
@@ -1,8 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import sys
 
-SPDX_HEADER = "# SPDX-License-Identifier: Apache-2.0"
+SPDX_HEADER = (
+    "# SPDX-License-Identifier: Apache-2.0\n"
+    "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project")
 SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:"
 
 
diff --git a/tools/check_triton_import.py b/tools/check_triton_import.py
index 18c9726a11ac01a2212918e1bdc9139301b767ec..77b2dfc391889487021f27d65d01a84fa845a77b 100644
--- a/tools/check_triton_import.py
+++ b/tools/check_triton_import.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess
 import sys
 
diff --git a/tools/enforce_regex_import.py b/tools/enforce_regex_import.py
index b55c4a94eac80fa1b010ff7d2b6c6a267290e523..63ceee5829aba3d93bb080e57072a9569c1594c4 100644
--- a/tools/enforce_regex_import.py
+++ b/tools/enforce_regex_import.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 import subprocess
@@ -58,6 +59,9 @@ def main() -> int:
         if not Path(filepath).exists():
             continue
 
+        if filepath == "setup.py":
+            continue
+
         violations = check_file(filepath)
         if violations:
             print(f"\n❌ {filepath}:")
diff --git a/tools/ep_kernels/README.md b/tools/ep_kernels/README.md
index 5c98e999da335578621fda6cee1a02c7f2da5852..f1479146f053c6d42959b37bb2ddd6025bd9651a 100644
--- a/tools/ep_kernels/README.md
+++ b/tools/ep_kernels/README.md
@@ -1,11 +1,10 @@
 Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Technical Report](http://arxiv.org/abs/2412.19437), is an efficient way to deploy sparse MoE models with many experts. However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package.
 
-Here we break down the requirements in 3 steps:
+Here we break down the requirements in 2 steps:
 1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
-2. Build and install the system libraries (GDR Copy). This step requires root access. You can do it inside a Docker container so that they can be shipped as a single image.
-3. Build and install the system drivers (GDR Copy, and necessary modifications to NVIDIA driver to enable IBGDA). This step requires root access, and must be done on the host machine.
+2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine.
 
-2 and 3 are necessary for multi-node deployment.
+2 is necessary for multi-node deployment.
 
 All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`.
 
@@ -21,7 +20,6 @@ bash install_python_libraries.sh
 
 ```bash
 bash install_python_libraries.sh
-sudo bash install_system_libraries.sh
-sudo bash install_system_drivers.sh
+sudo bash configure_system_drivers.sh
 sudo reboot # Reboot is required to load the new driver
 ```
diff --git a/tools/ep_kernels/configure_system_drivers.sh b/tools/ep_kernels/configure_system_drivers.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cf15c1daccaecab664e7bffd77cf63e570887ef4
--- /dev/null
+++ b/tools/ep_kernels/configure_system_drivers.sh
@@ -0,0 +1,7 @@
+set -ex
+
+# turn on IBGDA
+echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf
+update-initramfs -u
+
+echo "Please reboot the system to apply the changes"
diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh
index e5632f4b58758e5409f3a17964cddabf1229bd53..83643c084bf9a66daf23f2892d3966b96c507725 100644
--- a/tools/ep_kernels/install_python_libraries.sh
+++ b/tools/ep_kernels/install_python_libraries.sh
@@ -13,16 +13,6 @@ fi
 # install dependencies if not installed
 pip3 install cmake torch ninja
 
-# build gdrcopy, required by nvshmem
-pushd $WORKSPACE
-wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.4.4.tar.gz
-mkdir -p gdrcopy_src
-tar -xvf v2.4.4.tar.gz -C gdrcopy_src --strip-components=1
-pushd gdrcopy_src
-make -j$(nproc)
-make prefix=$WORKSPACE/gdrcopy_install install
-popd
-
 # build nvshmem
 pushd $WORKSPACE
 mkdir -p nvshmem_src
@@ -34,26 +24,30 @@ git init
 git apply -vvv nvshmem.patch
 
 # assume CUDA_HOME is set correctly
-export GDRCOPY_HOME=$WORKSPACE/gdrcopy_install
+if [ -z "$CUDA_HOME" ]; then
+    echo "CUDA_HOME is not set, please set it to your CUDA installation directory."
+    exit 1
+fi
+
+# disable all features except IBGDA
+export NVSHMEM_IBGDA_SUPPORT=1
+
 export NVSHMEM_SHMEM_SUPPORT=0
 export NVSHMEM_UCX_SUPPORT=0
 export NVSHMEM_USE_NCCL=0
-export NVSHMEM_IBGDA_SUPPORT=1
 export NVSHMEM_PMIX_SUPPORT=0
 export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
-export NVSHMEM_USE_GDRCOPY=1
-export NVSHMEM_IBRC_SUPPORT=1
-
-# remove MPI dependency
+export NVSHMEM_USE_GDRCOPY=0
+export NVSHMEM_IBRC_SUPPORT=0
 export NVSHMEM_BUILD_TESTS=0
 export NVSHMEM_BUILD_EXAMPLES=0
 export NVSHMEM_MPI_SUPPORT=0
+export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
+export NVSHMEM_BUILD_TXZ_PACKAGE=0
+export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
 
-cmake -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install
-
-cd $WORKSPACE/nvshmem_build/
-make -j$(nproc)
-make install
+cmake -G Ninja -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install
+cmake --build $WORKSPACE/nvshmem_build/ --target install
 
 popd
 
diff --git a/tools/ep_kernels/install_system_drivers.sh b/tools/ep_kernels/install_system_drivers.sh
deleted file mode 100644
index 8b0669ef404ff89e42f99ad6bb707beb8461f798..0000000000000000000000000000000000000000
--- a/tools/ep_kernels/install_system_drivers.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-set -ex
-
-# prepare workspace directory
-WORKSPACE=$1
-if [ -z "$WORKSPACE" ]; then
-    export WORKSPACE=$(pwd)/ep_kernels_workspace
-fi
-
-if [ ! -d "$WORKSPACE" ]; then
-    mkdir -p $WORKSPACE
-fi
-
-# build and install gdrcopy driver
-pushd $WORKSPACE
-cd gdrcopy_src
-./insmod.sh
-# run gdrcopy_copybw to test the installation
-$WORKSPACE/gdrcopy_install/bin/gdrcopy_copybw
-
-# turn on IBGDA
-echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf
-update-initramfs -u
-
-echo "Please reboot the system to apply the changes"
diff --git a/tools/ep_kernels/install_system_libraries.sh b/tools/ep_kernels/install_system_libraries.sh
deleted file mode 100644
index c148d5443900a845adf302bd4045c91f72d89e06..0000000000000000000000000000000000000000
--- a/tools/ep_kernels/install_system_libraries.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-set -ex
-
-# prepare workspace directory
-WORKSPACE=$1
-if [ -z "$WORKSPACE" ]; then
-    export WORKSPACE=$(pwd)/ep_kernels_workspace
-fi
-
-if [ ! -d "$WORKSPACE" ]; then
-    mkdir -p $WORKSPACE
-fi
-
-# build and install gdrcopy system packages
-pushd $WORKSPACE
-cd gdrcopy_src/packages
-apt install devscripts -y
-CUDA=${CUDA_HOME:-/usr/local/cuda} ./build-deb-packages.sh
-dpkg -i *.deb
diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py
index 9601b578eb97cb70f2d1388294913d166cb7dc18..209c3a576aeed77b7babdcf469cf5dd14895c258 100644
--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import json
diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py
index 8ec3dfc97a73486131f05c4fecd5ce260ad176f4..038d3c44f043a44355a5c73561e40bd7676c753e 100644
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import copy
diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py
index 011af25229f4bd7f0d655a232ebd3b402a9d8cef..7368ae95313d2ebb1925162eef19189ee75e6cc2 100644
--- a/tools/report_build_time_ninja.py
+++ b/tools/report_build_time_ninja.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2018 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
diff --git a/use_existing_torch.py b/use_existing_torch.py
index 7d352c6ca6fa75785201d317584f7ddf9d5f208f..a9f79e16981c4c7bdc489799816bba0cb7630527 100644
--- a/use_existing_torch.py
+++ b/use_existing_torch.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import glob
 
diff --git a/vllm/__init__.py b/vllm/__init__.py
index 52022fb8f016879f3fa04f1e8f027344d3c06ad4..6232b657e82842be045b8db187bd058482822dbf 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
 # The version.py should be independent library, and we always import the
 # version library first.  Such assumption is critical for some customization.
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 708136b59ea72c660f01cdff6d5cab6bfb2eb630..9c9bff164ecfcfdc108f13d289633d13579ac612 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
 import importlib
@@ -281,6 +282,45 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
     torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
 
 
+def apply_repetition_penalties_torch(
+        logits: torch.Tensor, prompt_mask: torch.Tensor,
+        output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None:
+    repetition_penalties = repetition_penalties.unsqueeze(dim=1).repeat(
+        1, logits.size(1))
+    # If token appears in prompt or output, apply, otherwise use 1.0 for no-op.
+    penalties = torch.where(prompt_mask | output_mask, repetition_penalties,
+                            1.0)
+    # If logits are positive, divide by penalty, otherwise multiply by penalty.
+    scaling = torch.where(logits > 0, 1.0 / penalties, penalties)
+    logits *= scaling
+
+
+def apply_repetition_penalties_cuda(
+        logits: torch.Tensor, prompt_mask: torch.Tensor,
+        output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None:
+    torch.ops._C.apply_repetition_penalties_(logits, prompt_mask, output_mask,
+                                             repetition_penalties)
+
+
+def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor,
+                               output_mask: torch.Tensor,
+                               repetition_penalties: torch.Tensor) -> None:
+    """Apply repetition penalties to logits in-place.
+
+    Args:
+        logits: The logits tensor of shape [num_seqs, vocab_size].
+        prompt_mask: A boolean tensor indicating which tokens appear in the prompt.
+        output_mask: A boolean tensor indicating which tokens appear in the output.
+        repetition_penalties: The repetition penalties of shape (num_seqs, ).
+    """
+    if current_platform.is_cuda() and logits.is_contiguous():
+        apply_repetition_penalties_cuda(logits, prompt_mask, output_mask,
+                                        repetition_penalties)
+    else:
+        apply_repetition_penalties_torch(logits, prompt_mask, output_mask,
+                                         repetition_penalties)
+
+
 def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int,
                            input_tokens: torch.Tensor,
                            sampled_token_ids: torch.Tensor,
@@ -811,11 +851,16 @@ def cutlass_scaled_sparse_mm(
     return out
 
 
-def get_cutlass_moe_mm_data(
-        topk_ids: torch.Tensor, expert_offsets: torch.Tensor,
-        problem_sizes1: torch.Tensor, problem_sizes2: torch.Tensor,
-        input_permutation: torch.Tensor, output_permutation: torch.Tensor,
-        num_experts: int, n: int, k: int):
+def get_cutlass_moe_mm_data(topk_ids: torch.Tensor,
+                            expert_offsets: torch.Tensor,
+                            problem_sizes1: torch.Tensor,
+                            problem_sizes2: torch.Tensor,
+                            input_permutation: torch.Tensor,
+                            output_permutation: torch.Tensor,
+                            num_experts: int,
+                            n: int,
+                            k: int,
+                            blockscale_offsets: Optional[torch.Tensor] = None):
     """
     Prepare data necessary to perform CUTLASS grouped matrix multiplications
     used in CUTLASS-based fused MoE.
@@ -833,19 +878,63 @@ def get_cutlass_moe_mm_data(
                          before executing the MMs.
     - output_permutation: Permutation that must be used to shuffle the output
                           after executing the MMs.
+    - blockscale_offsets: Optional argument passed for fp4 moe. Indices that
+                          mark at which block scale index each expert begins
+                          its computation. The number of block scale rows
+                          computed with expert E is blockscale_offsets[E + 1] -
+                          blockscale_offsets[E]
     """
     return torch.ops._C.get_cutlass_moe_mm_data(topk_ids, expert_offsets,
                                                 problem_sizes1, problem_sizes2,
                                                 input_permutation,
                                                 output_permutation,
-                                                num_experts, n, k)
+                                                num_experts, n, k,
+                                                blockscale_offsets)
+
+
+def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor):
+    """
+    Shuffle and expand the input tensor according to the dst2src_map and store the result in output_tensor.
+    This is used in MoE to permute the input tensor before performing grouped matrix multiplications.
+    """
+    num_tokens_permuted = dst2src_map.shape[0]
+    output_tensor = torch.empty((num_tokens_permuted, input_tensor.shape[1]),
+                                device=input_tensor.device,
+                                dtype=input_tensor.dtype)
+    torch.ops._moe_C.shuffle_rows(input_tensor, dst2src_map, output_tensor)
+    return output_tensor
+
+
+def get_cutlass_pplx_moe_mm_data(expert_offsets: torch.Tensor,
+                                 problem_sizes1: torch.Tensor,
+                                 problem_sizes2: torch.Tensor,
+                                 expert_num_tokens: torch.Tensor,
+                                 num_local_experts: int, padded_m: int, n: int,
+                                 k: int):
+    """
+    Prepare data necessary to perform CUTLASS grouped matrix multiplications
+    used in CUTLASS-based fused MoE.
+
+    The function takes in expert_num_tokens (token count per expert) and
+    non_zero_expert_idxs (consecutive indices of experts with non-zero token 
+    counts) and uses them to compute:
+    - expert_offsets: Indices that mark at which token index each expert begins
+                      its computation.
+    - problem_sizes1, problem_sizes2: MxNxK sizes of each expert's
+                                      multiplication in two grouped MMs used in
+                                      the fused MoE operation.
+    """
+    return torch.ops._C.get_cutlass_pplx_moe_mm_data(
+        expert_offsets, problem_sizes1, problem_sizes2, expert_num_tokens,
+        num_local_experts, padded_m, n, k)
 
 
 def cutlass_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor,
                    b_tensors: torch.Tensor, a_scales: torch.Tensor,
                    b_scales: torch.Tensor, expert_offsets: torch.Tensor,
                    problem_sizes: torch.Tensor, a_strides: torch.Tensor,
-                   b_strides: torch.Tensor, c_strides: torch.Tensor):
+                   b_strides: torch.Tensor, c_strides: torch.Tensor,
+                   per_act_token: bool, per_out_ch: bool):
     """
     A single grouped matrix multiplication used in CUTLASS-based fused MoE.
     The function executes fp8-quantized OUT = AB matrix multiplication.
@@ -860,7 +949,7 @@ def cutlass_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor,
     return torch.ops._C.cutlass_moe_mm(out_tensors, a_tensors, b_tensors,
                                        a_scales, b_scales, expert_offsets,
                                        problem_sizes, a_strides, b_strides,
-                                       c_strides)
+                                       c_strides, per_act_token, per_out_ch)
 
 
 def cutlass_fp4_moe_mm(a_tensors: torch.Tensor, b_tensors: torch.Tensor,
@@ -1090,14 +1179,12 @@ def scaled_fp4_experts_quant(
     expert_offsets: torch.Tensor,
     blockscale_offsets: torch.Tensor,
     topk: int,
-    expert_map: Optional[torch.Tensor] = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize input tensor to FP4 and return quantized tensor and scale, for
     packed MoE Inputs.
     Args:
-        input: The input tensor to be quantized to FP4
-        expert_map: The expert map tensor
+        input_tensor: The input tensor to be quantized to FP4
         input_global_scale: A scalar scaling factor for the entire tensor.
         expert_offsets: The expert offsets tensor
         blockscale_offsets: The blockscale offsets tensor
@@ -1109,14 +1196,13 @@ def scaled_fp4_experts_quant(
     assert input_tensor.ndim == 2, (
         f'input.ndim needs to be == 2, but got {input_tensor.ndim}.')
 
-    input_tensor = input_tensor[
-        expert_map] if expert_map is not None else input_tensor
-    m_numtopk, k = input_tensor.shape
     # Control the maximum number of tokens per expert supported by the
     # NVFP4 MoE Expert Quantization. This is used to prevent the kernel
     # from running out of memory. This value can also be increased to support
     # larger models.
     MAX_TOKENS_PER_EXPERT = envs.VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE
+    m_numtopk, k = input_tensor.shape
+
     assert (m_numtopk <= MAX_TOKENS_PER_EXPERT * topk), (
         f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT("
         f"{MAX_TOKENS_PER_EXPERT})"
diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index a9a624b85abc5f748c5c25d6bfa980dae4735115..ae63e06030dd16935ce92b7c36dc3664146ca037 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/adapter_commons/layers.py b/vllm/adapter_commons/layers.py
index 9cc2b181fc7cc9d5d54f726e6af4ab6510a4d64c..9753a088065655cc6fc190b4ad36eb58a5457cd3 100644
--- a/vllm/adapter_commons/layers.py
+++ b/vllm/adapter_commons/layers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 
diff --git a/vllm/adapter_commons/models.py b/vllm/adapter_commons/models.py
index a84fbea2e444a24c6ff16bb24bb75c5098b0d078..7b685880a9e6c74e09ef4a27de31917dbe91ce9e 100644
--- a/vllm/adapter_commons/models.py
+++ b/vllm/adapter_commons/models.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Optional, TypeVar
diff --git a/vllm/adapter_commons/request.py b/vllm/adapter_commons/request.py
index 2b604b91bbb6b43e0e5b63fc555c7cef15c88f3d..8135b54ba19f61d0c10e4f8b1316ff3cc5feacd9 100644
--- a/vllm/adapter_commons/request.py
+++ b/vllm/adapter_commons/request.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 
diff --git a/vllm/adapter_commons/utils.py b/vllm/adapter_commons/utils.py
index 46e9629e1f55fc3d8194b54e2e5eef6db5e127cd..a1a56b6bbd4ba14efee71a2fba6f067731aa8628 100644
--- a/vllm/adapter_commons/utils.py
+++ b/vllm/adapter_commons/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/adapter_commons/worker_manager.py b/vllm/adapter_commons/worker_manager.py
index 3c1d26404c9905fd79c20eaccea8ee1c7a3f764e..07e85d138ac5073c9ae08808f0de353c6de78c5c 100644
--- a/vllm/adapter_commons/worker_manager.py
+++ b/vllm/adapter_commons/worker_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Any, Optional
diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py
index a21eb7f599faadfd127a19b96c478acd9a8d2958..1c16230849bca6f64ca6b181707ae35307c84111 100644
--- a/vllm/assets/audio.py
+++ b/vllm/assets/audio.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from pathlib import Path
diff --git a/vllm/assets/base.py b/vllm/assets/base.py
index 03f3b9dabf1438662d533be53d3abacc679c472b..31cde431b5b6ad890841f535ffcf0aac654c79c8 100644
--- a/vllm/assets/base.py
+++ b/vllm/assets/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import lru_cache
 from pathlib import Path
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index d8cca9b74edd521f99af5fbb223c82bd29177566..c977242a3d48427255971cf9f3836f445ca5a4ad 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Literal
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index bf06746a9ff66e39cf02e4b39b991d45f03a0f11..01834aeeb6c12f02658035288dc73f6899c6ec09 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from functools import lru_cache
diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py
index 85c5715faba7f11c95db33a9eb82a7f3946613f8..344040586a53266b447b17fefc984e4b58b0c6f0 100644
--- a/vllm/attention/__init__.py
+++ b/vllm/attention/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadata,
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index f3d6ffaeb8f45959dd4a3de23d2370f3d2f7c246..0ba5a5bf94c9b2566575f0b9ce06a51628f01410 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
@@ -269,6 +270,7 @@ class AttentionImpl(ABC, Generic[T]):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
     ) -> None:
         raise NotImplementedError
 
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index ea4f840729b48e148e2e9cd0c2d8b1b46ddcbf36..c1663516de3583a80ca9fc1256dfea312effacf9 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Tuple, Type
@@ -305,7 +306,10 @@ class BlocksparseFlashAttentionImpl(AttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         assert blocksparse_params is not None
         assert alibi_slopes is None, ValueError(
             "Alibi not support for blocksparse flash attention.")
diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py
index 4567893a9ef7c79ad53b74711fae5c8d55ef2a63..793cb87b743420630a6e9a2118c4debfcdb5a949 100644
--- a/vllm/attention/backends/cpu_mla.py
+++ b/vllm/attention/backends/cpu_mla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
@@ -177,7 +178,7 @@ class CPUMLAMetadataBuilder(AttentionMetadataBuilder[CPUMLAMetadata]):
             seq_lens_tensor=seq_lens_tensor,
             max_query_len=max_query_len,
             max_kv_len=max_kv_len,
-            query_start_loc=query_start_loc,
+            prefill_query_start_loc=query_start_loc,
             kv_start_loc=kv_start_loc,
             max_decode_seq_len=input_data.max_decode_seq_len,
             num_prefills=input_data.num_prefills,
@@ -205,12 +206,13 @@ class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]):
             blocksparse_params: Optional[Dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
 
         unsupported_features = [
             alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
@@ -262,8 +264,8 @@ class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]):
             key=k,
             value=v_padded,
             out=output,
-            seqlen_q=prefill_metadata.query_start_loc,
-            seqlen_k=prefill_metadata.query_start_loc,
+            seqlen_q=prefill_metadata.prefill_query_start_loc,
+            seqlen_k=prefill_metadata.prefill_query_start_loc,
             max_seqlen_q=prefill_metadata.max_query_len,
             max_seqlen_k=prefill_metadata.max_query_len,
             pdropout=0.0,
diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py
index eceab1f1ac9a31887a624838e9742d19e072ec07..963bccdf21bc08cca267c3b05509bec7a9bc2d95 100644
--- a/vllm/attention/backends/dual_chunk_flash_attn.py
+++ b/vllm/attention/backends/dual_chunk_flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with Dual chunk flash attention and sparse attention.
 """
 import math
@@ -289,9 +290,12 @@ class DualChunkFlashAttentionImpl(FlashAttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         layer_idx: int = -1,
         dual_chunk_attention_config: Optional[Dict[str, Any]] = None,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 7f8f720eee0aedd8cb2a786a950ae9333c4e4ba8..73e3772682e69618c78e2d8e9d5aa5342e553c04 100755
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with FlashAttention."""
 from collections import defaultdict
 from dataclasses import dataclass
@@ -617,8 +618,11 @@ class FlashAttentionImpl(AttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if blocksparse_params is not None:
             raise ValueError(
                 "FlashAttention does not support block-sparse attention.")
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 37b20d0739f7073c6196b2235396edf53af7bdc2..a3937760f03b89a9ac6f744f710283023d338fba 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import os
@@ -935,8 +936,11 @@ class FlashInferImpl(AttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if use_irope:
             logger.warning_once(
                 "Using irope in FlashInfer is not supported yet, it will fall"
diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py
index 0e62748ddbee49b48d946c20f0bf205426b5eec6..e185d0260d0a055185fd0c5c1fcd4921c15b1af0 100644
--- a/vllm/attention/backends/flashmla.py
+++ b/vllm/attention/backends/flashmla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -183,12 +184,13 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
             blocksparse_params: Optional[Dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str] = None,
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
 
         assert is_flashmla_supported(), \
             "FlashMLA is not supported on this device"
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index d701c59a234f8e1ee069501c6003bfae4b742a1a..9bd513fd894f5b2ba6514a3f806b22f220ff2539 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 ###############################################################################
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
@@ -109,9 +110,12 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         max_seq_len: int = 4096,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
         super(AttentionImpl, self).__init__()
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if use_irope:
             logger.warning_once(
                 "Using irope in HPU is not supported yet, it will fall back "
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index f322c7b3dd6a2be9e212cf18db98c7c440bc88a0..5051c6a7cc4fd61406d2834d201ee15be3db6822 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """ Attention layer with torch scaled_dot_product_attention
     and PagedAttention."""
 from dataclasses import dataclass
@@ -122,8 +123,11 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if use_irope:
             logger.warning_once(
                 "Using irope in Ipex is not supported yet, it will fall"
diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py
index eb60329873d1987066634e1ae1ee2676bbb63b6d..0088ac466909150347c85868f2ce339d1e404fad 100644
--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 # MLA Common Components
 
@@ -1001,6 +1002,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
         blocksparse_params: Optional[Dict[str, Any]],
         logits_soft_cap: Optional[float],
         attn_type: str,
+        kv_sharing_target_layer_name: Optional[str],
         # MLA Specific Arguments
         q_lora_rank: Optional[int],
         kv_lora_rank: int,
@@ -1010,6 +1012,8 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
         v_head_dim: int,
         kv_b_proj: ColumnParallelLinear,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing not supported in V0.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 19642a939b481a9eb8da847f1300dea436fb9174..7ad67615d33d95f46f499b58ee2fd03214644fcb 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
@@ -108,8 +109,11 @@ class PallasAttentionBackendImpl(AttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if use_irope:
             logger.warning_once(
                 "Using irope in Pallas is not supported yet, it will fall back "
diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py
index f1def25c89cffe79e416c2d340ce332ee72a77d7..820ddcab77d71089909b385293a2d174a647f73c 100644
--- a/vllm/attention/backends/placeholder_attn.py
+++ b/vllm/attention/backends/placeholder_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections import defaultdict
 from dataclasses import dataclass
diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py
index b048220020f143973a64a9a02fecb7e06a13c307..1edf34351db3f362e8f60b95edf674bd9094af37 100644
--- a/vllm/attention/backends/rocm_aiter_mla.py
+++ b/vllm/attention/backends/rocm_aiter_mla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -132,8 +133,6 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
 
     def __init__(self, input_builder: "ModelInputForGPUBuilder"):
         super().__init__(input_builder)
-        assert self.runner.model_config.max_model_len == 32768,\
-                "AITER MLA requires max model len to be set to 32768"
         assert self.block_size == 1, "AITER MLA requires only block size 1."
 
     def prepare(self):
@@ -371,12 +370,13 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
             blocksparse_params: Optional[dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
 
         unsupported_features = [
             alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 2697f1f9aad608faa24d0cb03902000499fd4dcf..703b59041ac680133b4e1041d0711a4f4af9a416 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer ROCm GPUs."""
 import itertools
 from dataclasses import dataclass
@@ -493,8 +494,11 @@ class ROCmFlashAttentionImpl(AttentionImpl):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if use_irope:
             logger.warning_once(
                 "Using irope in ROCm Flash Attention is not supported yet, it "
@@ -770,8 +774,9 @@ class ROCmFlashAttentionImpl(AttentionImpl):
                                       and layer._v_scale and layer._prob_scale
                                       and self.kv_cache_dtype == "fp8")
                     full_scales = (
-                        layer._q_scale, layer._k_scale, layer._v_scale,
-                        layer._prob_scale) if use_fp8_scales else None
+                        layer._q_scale.item(), layer._k_scale.item(),
+                        layer._v_scale.item(),
+                        layer._prob_scale.item()) if use_fp8_scales else None
                     self.triton_attn_func(
                         query,
                         key,
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index c1bd638f2605d3d33d15581e8b8d18be90f2df7a..23231c323f13911e48dd78cd0eccc891087ddc35 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """ Attention layer with torch scaled_dot_product_attention
     and PagedAttention."""
 from dataclasses import dataclass
@@ -86,10 +87,13 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata):
     # For chunked prefill only
     max_query_len: Optional[int] = None
     max_kv_len: Optional[int] = None
-    query_start_loc: Optional[torch.Tensor] = None
+    prefill_query_start_loc: Optional[torch.Tensor] = None
     kv_start_loc: Optional[torch.Tensor] = None
     prefill_block_tables: Optional[torch.Tensor] = None
 
+    # For V1 logits index only
+    query_start_loc: Optional[torch.Tensor] = None
+
     # Begin encoder attn & enc/dec cross-attn fields...
     # Encoder sequence lengths representation
     encoder_seq_lens: Optional[List[int]] = None
@@ -374,7 +378,7 @@ class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]):
             seq_lens_tensor=seq_lens_tensor,
             max_query_len=max_query_len,
             max_kv_len=max_kv_len,
-            query_start_loc=query_start_loc,
+            prefill_query_start_loc=query_start_loc,
             kv_start_loc=kv_start_loc,
             max_decode_seq_len=input_data.max_decode_seq_len,
             num_prefills=input_data.num_prefills,
@@ -404,8 +408,11 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if blocksparse_params is not None:
             raise ValueError(
                 "Torch SPDA does not support block-sparse attention.")
@@ -466,6 +473,11 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
+
+        # For warming-up
+        if attn_metadata is None:
+            return query
+
         attn_type = self.attn_type
         if (attn_type == AttentionType.ENCODER
                 and (not attn_metadata.is_all_encoder_attn_metadata_set)):
@@ -533,8 +545,8 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
 
         output = torch.empty_like(query)
         if prefill_meta := attn_metadata.prefill_metadata:
-            assert attn_metadata.seq_lens is not None
             if not prefill_meta.prefill_metadata.chunked_prefill:  # type: ignore
+                assert attn_metadata.seq_lens is not None
                 self._run_sdpa_forward(output,
                                        query,
                                        key,
@@ -551,7 +563,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
                     query[:prefill_meta.num_prefill_tokens, :, :],
                     key_cache,
                     value_cache,
-                    prefill_meta.query_start_loc,
+                    prefill_meta.prefill_query_start_loc,
                     prefill_meta.kv_start_loc,
                     prefill_meta.max_query_len,
                     prefill_meta.max_kv_len,
diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py
index 6945c2c6e29cd33666274a1f79286d490130b38b..e06f7d54e3421e8bd580c2bcb56df87de8a05559 100644
--- a/vllm/attention/backends/triton_mla.py
+++ b/vllm/attention/backends/triton_mla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Dict, List, Optional, Type
 
@@ -37,12 +38,13 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
             blocksparse_params: Optional[Dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
 
         unsupported_features = [
             alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index a281c9771a82e9b2596ea1f9db8132b8391da9e6..e3f02a193614a368604b168d89983dc4a94ae00b 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention backend utils"""
 from collections import defaultdict
 from contextlib import contextmanager
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index a9d4a70b55a8c10909104c5f81b07ec6a4faaad1..04ef928b7d7b3e721ade7abb1b37c38b30f21a91 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with xFormers and PagedAttention."""
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
@@ -389,8 +390,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported in V0.")
         if blocksparse_params is not None:
             raise ValueError(
                 "XFormers does not support block-sparse attention.")
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 9e4fbe0b4c6c28a2f3a1de42e448851a01dfeca6..a5fbd1a1c0166d66ff48bae3c6874dc198c53991 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer."""
 from typing import Any, Dict, List, Optional
 
@@ -20,6 +21,7 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import direct_register_custom_op
+from vllm.v1.attention.backends.utils import validate_kv_sharing_target
 
 
 class Attention(nn.Module):
@@ -49,6 +51,7 @@ class Attention(nn.Module):
         use_mla: bool = False,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         **extra_impl_args,
     ) -> None:
         """
@@ -134,7 +137,7 @@ class Attention(nn.Module):
         self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
                              alibi_slopes, sliding_window, kv_cache_dtype,
                              blocksparse_params, logits_soft_cap, attn_type,
-                             **extra_impl_args)
+                             kv_sharing_target_layer_name, **extra_impl_args)
         self.backend = backend_name_to_enum(attn_backend.get_name())
         self.dtype = dtype
 
@@ -152,6 +155,19 @@ class Attention(nn.Module):
         compilation_config.static_forward_context[prefix] = self
         self.layer_name = prefix
         self.attn_type = attn_type
+
+        if kv_sharing_target_layer_name is not None:
+            if not envs.VLLM_USE_V1:
+                raise NotImplementedError(
+                    "Cross-layer KV sharing is not supported in V0.")
+
+            validate_kv_sharing_target(
+                prefix,
+                kv_sharing_target_layer_name,
+                compilation_config.static_forward_context,
+            )
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
+
         # use a placeholder kv cache tensor during init, which will be replaced
         # by bind_kv_cache
         # this variable will not be accessed if use_direct_call is True
diff --git a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py
index bc87ce33a30158a7ec79633013ae004d9eabf2e0..05fa9d11f22837acf45306329242a4c8fae1c6ca 100644
--- a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py
+++ b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py
index 6ab69ea5b4098d8a1cc0f3728c5f013f9564718d..c6f6cc29793f46735ff729bcaf4aa80ca15da1c2 100644
--- a/vllm/attention/ops/blocksparse_attention/interface.py
+++ b/vllm/attention/ops/blocksparse_attention/interface.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 
diff --git a/vllm/attention/ops/blocksparse_attention/utils.py b/vllm/attention/ops/blocksparse_attention/utils.py
index e64fc1139713e9014a36624ba653a96eab464b42..445720c709c4746e54cf6eecc258e854405576b1 100644
--- a/vllm/attention/ops/blocksparse_attention/utils.py
+++ b/vllm/attention/ops/blocksparse_attention/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Helper functions for 3D sparse pattern
 # These function are not optimized and very inefficient.
diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py
index 785799b6bf684b9674337da31e3a9371781791bf..4f839348e5222b044fff326ecd0a42ea8b1fff09 100644
--- a/vllm/attention/ops/chunked_prefill_paged_decode.py
+++ b/vllm/attention/ops/chunked_prefill_paged_decode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Authors:
 #  - Burkhard Ringlein <ngl@zurich.ibm.com>
@@ -264,8 +265,8 @@ def chunked_prefill_paged_decode(
     # Conversion of FP8 Tensor from uint8 storage to
     # appropriate torch.dtype for interpretation by Triton
     if "fp8" in kv_cache_dtype:
-        assert key_cache.dtype == torch.uint8
-        assert value_cache.dtype == torch.uint8
+        assert key_cache.dtype in [torch.uint8, current_platform.fp8_dtype()]
+        assert value_cache.dtype in [torch.uint8, current_platform.fp8_dtype()]
 
         if kv_cache_dtype in ("fp8", "fp8_e4m3"):
             target_dtype = current_platform.fp8_dtype()
diff --git a/vllm/attention/ops/flashmla.py b/vllm/attention/ops/flashmla.py
index 18b69a6b3ddf8c641dd585fc95df3eee0886de3d..b85f27ac417cf0ce779a254c4204b6277566705c 100644
--- a/vllm/attention/ops/flashmla.py
+++ b/vllm/attention/ops/flashmla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/flash_mla/flash_mla_interface.py
 from typing import Optional, Tuple
 
diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py
index a97c36338d3c5ae3849508bbed329956c75618d7..412dd20ec1deb6dcda889ecef9fa4f7aadd625f0 100644
--- a/vllm/attention/ops/hpu_paged_attn.py
+++ b/vllm/attention/ops/hpu_paged_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 ###############################################################################
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py
index 1702203b183469d038fcbaaf7af7febafd3e90a5..b7e4ba4d7416afcc025a9303189c551d50940498 100644
--- a/vllm/attention/ops/ipex_attn.py
+++ b/vllm/attention/ops/ipex_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Dict, List, Optional, Tuple
 
diff --git a/vllm/attention/ops/merge_attn_states.py b/vllm/attention/ops/merge_attn_states.py
index f9fcfe6a63386de28a5ad3c386b85247f8bcf4fc..5cb1a47394cf6bb04ec13a8810e54143ba167c53 100644
--- a/vllm/attention/ops/merge_attn_states.py
+++ b/vllm/attention/ops/merge_attn_states.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py
index 8c9145bb99e8cc767fd5f6438d1b4def398fceaf..e28ff7e8b4ed92b1efa009122c2c044ae557cc8f 100644
--- a/vllm/attention/ops/nki_flash_attn.py
+++ b/vllm/attention/ops/nki_flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import neuronxcc.nki.isa as nisa
 import neuronxcc.nki.language as nl
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py
index 827c3041a682ece55258499262c97b6bb2c2e3c9..c6d1501e2757844bb70754213ed9df2f5d3df063 100644
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 86d256b630bf5e596c368290d426cfd7c37e6bcb..13bef96722d2bef657c418baa98af63c8ebe72c4 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # The kernels in this file are adapted from LightLLM's context_attention_fwd:
 # https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py
@@ -744,8 +745,8 @@ def context_attention_fwd(q,
     # Conversion of FP8 Tensor from uint8 storage to
     # appropriate torch.dtype for interpretation by Triton
     if "fp8" in kv_cache_dtype:
-        assert (k_cache.dtype == torch.uint8)
-        assert (v_cache.dtype == torch.uint8)
+        assert k_cache.dtype in [torch.uint8, current_platform.fp8_dtype()]
+        assert v_cache.dtype in [torch.uint8, current_platform.fp8_dtype()]
 
         if kv_cache_dtype in ("fp8", "fp8_e4m3"):
             target_dtype = current_platform.fp8_dtype()
diff --git a/vllm/attention/ops/rocm_aiter_mla.py b/vllm/attention/ops/rocm_aiter_mla.py
index 421891ab6b733954b85cf380001f191c7c1a9e4a..cce6b4639460610609a8dc50e58001afd32b6d54 100644
--- a/vllm/attention/ops/rocm_aiter_mla.py
+++ b/vllm/attention/ops/rocm_aiter_mla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py
index 0f3cf1842c8052a9fa3646c36b34809f355c9964..ad97152e208b81e4883ebbabfbc472c906f269f8 100644
--- a/vllm/attention/ops/rocm_aiter_paged_attn.py
+++ b/vllm/attention/ops/rocm_aiter_paged_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import aiter as rocm_aiter
diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py
index fb983907e375eaa746702c4bce8facaf726f19f9..c27b377aebe99ccb6c82c39471f651a265582653 100644
--- a/vllm/attention/ops/triton_decode_attention.py
+++ b/vllm/attention/ops/triton_decode_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index 8940d0b66225829fb6ff7384763598d74539fd6f..a26e713b1c6247061c1961df9720aa6d0224337d 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -1,236 +1,34 @@
+#!/usr/bin/env python
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Fused Attention
 ===============
 
-This is a Triton implementation of the Flash Attention v2 algorithm
-See https://tridao.me/publications/flash2/flash2.pdf
+This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao
+(https://tridao.me/publications/flash2/flash2.pdf)
+Credits: OpenAI kernel team, AMD ML Frameworks Triton team
 
-Credits:
-AMD Triton kernels team
-OpenAI kernel team
-
-Currently only the forward kernel is supported, and contains these features:
+Features supported:
 
 1) Fwd with causal masking
-2) Arbitrary Q and KV sequence lengths
-3) Arbitrary head sizes
-4) Multi and grouped query attention
-5) Variable sequence lengths
-6) ALiBi and matrix bias
-7) FP8 support
+2) Any sequence lengths without padding (currently fwd kernel only)
+3) Support for different sequence lengths for q and k
+4) Nested tensor API currently does not support dropout or bias.
 
-"""
+Not currently supported:
 
-from typing import Optional
+1) Non power of two head dims
+
+"""
 
 import torch
 
-from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
+from vllm.platforms.rocm import on_gfx1x
 from vllm.triton_utils import tl, triton
 
-SUPPORTED_LAYOUTS = ['thd', 'bhsd', 'bshd']
-
-default_eight_bit_dtype_triton = tl.float8e4b8
-default_eight_bit_dtype_torch = current_platform.fp8_dtype()
-default_float8_info = torch.finfo(default_eight_bit_dtype_torch)
-
-FP8_MIN = triton.language.constexpr(default_float8_info.min)
-
-# According to https://github.com/vllm-project/vllm/blob/main
-#              /csrc/quantization/utils.cuh#L31,
-# need to make the max for the uz datatype be 224.0 for accuracy reasons.
-FP8_MAX = triton.language.constexpr(
-    default_float8_info.max if default_eight_bit_dtype_torch !=
-    torch.float8_e4m3fnuz else 224.0)
-
-
-class MetaData:
-    cu_seqlens_q = None
-    cu_seqlens_k = None
-    max_seqlens_q = 0
-    max_seqlens_k = 0
-    bias = None
-    alibi_slopes = None
-    causal = False
-    num_contexts = 0
-    varlen = False
-    eight_bit = False
-    layout = None
-    return_encoded_softmax = False
-    eight_bit_dtype_triton = default_eight_bit_dtype_triton
-    eight_bit_dtype_torch = default_eight_bit_dtype_torch
-    output_dtype = None
-
-    # Note about layouts:
-    #
-    # thd - [num_tokens, num_heads, head_size]
-    # bshd - [batch_size, seq_len, num_heads, head_size]
-    # bhsd - [batch_size, num_heads, seq_len, head_size]
-    #
-    # This is for each tensor, all tensors must have same layout.
-    # Q can have num_heads and seq_len differ from  from K and V,
-    # however K and V must agree on this.
-    #
-    # Notes about varlen and bias:
-    # Only one or the other is implemented, meaning can't combine
-    # both varlen and bias right now.
-    #
-    # Note about quantization:
-    # Only 8-bit quantization supported (for now) and specifically fp8.
-    # Scales must be tensors.
-    # o_scale: This is 'output scaling', but comes from parameter called
-    # 'input_scale', this is applied to the output from the kernel.
-    # o_scale should be None if none of the other quantization parameters
-    # are used.
-    #
-    # NOTE: Object is in a tentatively good state after initialized, however,
-    # to verify, call check_args(q,k,v,o) where o is the output tensor.
-    def __init__(
-        self,
-        sm_scale=1.0,
-        layout=None,  # layout can be 'bshd', 'bhsd', or 'thd'
-        output_dtype=None,
-        max_seqlens_q=0,
-        max_seqlens_k=0,
-        # varlen params
-        cu_seqlens_q=None,  # only 'thd' layout supported for varlen 
-        cu_seqlens_k=None,
-        # quant params
-        q_descale=None,
-        k_descale=None,
-        v_descale=None,
-        p_scale=None,
-        o_scale=None,
-        # bias params
-        bias=None,  # varlen not implemented for bias
-        seqlen_q=None,
-        seqlen_k=None,
-        # alibi params
-        alibi_slopes=None,
-        alibi_batch=None,
-        alibi_nheads=None,
-        # causal
-        causal=None,
-    ):
-        self.sm_scale = sm_scale
-        self.output_dtype = output_dtype
-        self.max_seqlens_q = max_seqlens_q
-        self.max_seqlens_k = max_seqlens_k
-        self.layout = layout
-        if cu_seqlens_q is not None or cu_seqlens_k is not None:
-            assert cu_seqlens_q is not None and cu_seqlens_k is not None
-            assert layout is None or layout not in [
-                'bshd', 'bhsd'
-            ], "Varlen only implemented for thd layout"
-            self.set_varlen_params(cu_seqlens_q, cu_seqlens_k)
-        quant_params = [q_descale, k_descale, v_descale, p_scale, o_scale]
-        if any(x is not None for x in quant_params):
-            p_descale = 1.0 / p_scale if p_scale is not None else None
-            self.set_eight_bit_params(q_descale, k_descale, v_descale, p_scale,
-                                      p_descale, o_scale)
-        if bias is not None:
-            self.need_bias(bias, seqlen_q, seqlen_k)
-        if alibi_slopes is not None:
-            self.need_alibi(alibi_slopes, alibi_batch, alibi_nheads)
-        if causal is not None and causal:
-            self.need_causal()
-
-    def set_varlen_params(self, cu_seqlens_q, cu_seqlens_k):
-        self.varlen = True
-        self.layout = 'thd'
-        self.cu_seqlens_q = cu_seqlens_q
-        self.cu_seqlens_k = cu_seqlens_k
-        # Without "varlen", there should still be one sequence.
-        assert len(cu_seqlens_q) >= 2
-        assert len(cu_seqlens_q) == len(cu_seqlens_k)
-        self.num_contexts = len(cu_seqlens_q) - 1
-        for i in range(0, self.num_contexts):
-            self.max_seqlens_q = max(
-                cu_seqlens_q[i + 1].item() - cu_seqlens_q[i].item(),
-                self.max_seqlens_q)
-            self.max_seqlens_k = max(
-                cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item(),
-                self.max_seqlens_k)
-
-    def set_eight_bit_params(self, q_descale, k_descale, v_descale, p_scale,
-                             p_descale, o_scale):
-        self.eight_bit = True
-        self.q_descale = q_descale
-        self.k_descale = k_descale
-        self.v_descale = v_descale
-        self.p_scale = p_scale
-        self.p_descale = p_descale
-        self.o_scale = o_scale
-        self.use_p_scale = (p_scale is not None) and (
-            p_descale is not None) and (v_descale is not None)
-        self.eight_bit_kv = ((q_descale is None) and (k_descale is not None)
-                             and (v_descale is not None))
-        self.eight_bit_dtype_torch = default_eight_bit_dtype_torch
-
-    def need_bias(self, bias, seqlen_q, seqlen_k):
-        assert bias is not None
-        assert bias.is_cuda
-        assert bias.dim() == 4
-        assert bias.shape[0] == 1
-        assert bias.shape[2:] == (seqlen_q, seqlen_k)
-        self.bias = bias
-
-    def need_alibi(self, alibi_slopes, batch, nheads):
-        assert alibi_slopes.is_cuda
-        assert alibi_slopes.dim() == 2
-        assert alibi_slopes.shape[0] == batch
-        assert alibi_slopes.shape[1] == nheads
-        self.alibi_slopes = alibi_slopes
-
-    def need_causal(self):
-        self.causal = True
-
-    def check_args(self, q, k, v, o):
-        assert q.dim() == k.dim() and q.dim() == v.dim()
-
-        batch, nheads_q, nheads_k, head_size = get_shape_from_layout(
-            q, k, self)
-        if self.varlen:
-            assert q.dim() == 3
-            assert self.cu_seqlens_q is not None
-            assert self.cu_seqlens_k is not None
-            assert len(self.cu_seqlens_q) == len(self.cu_seqlens_k)
-            # TODO: Remove once bias is supported with varlen
-            assert self.bias is None
-            assert not self.return_encoded_softmax
-        else:
-            assert q.dim() == 4
-            assert self.max_seqlens_q > 0 and self.max_seqlens_k > 0
-            assert self.cu_seqlens_q is None and self.cu_seqlens_k is None
-        assert k.shape == v.shape
-        assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1]
-        # TODO: Change assert if we support qkl f8 and v f16
-        if self.eight_bit:
-            if self.eight_bit_kv:
-                assert (v.dtype == k.dtype
-                        and k.dtype == self.eight_bit_dtype_torch)
-                assert q.dtype != k.dtype
-                assert (self.v_descale is not None) and (self.k_descale
-                                                         is not None)
-            else:
-                assert (q.dtype == k.dtype and q.dtype == v.dtype
-                        and q.dtype == self.eight_bit_dtype_torch)
-                assert (self.q_descale
-                        is not None) and (self.k_descale
-                                          is not None) and (self.v_descale
-                                                            is not None)
-                if self.use_p_scale:
-                    assert (self.p_scale is not None) and (self.p_descale
-                                                           is not None)
-        else:
-            assert (q.dtype == k.dtype) and (q.dtype == v.dtype)
-        assert head_size <= 256
-        assert o.shape == q.shape
-        assert (nheads_q % nheads_k) == 0
-        assert self.layout is not None
-        assert self.layout == 'thd' or not self.varlen
+torch_dtype: tl.constexpr = torch.float16
 
 
 @triton.jit
@@ -243,85 +41,40 @@ def max_fn(x, y):
     return tl.math.max(x, y)
 
 
-# Convenience function to load with optional boundary checks.
-# "First" is the major dim, "second" is the minor dim.
 @triton.jit
-def masked_load(ptrs, offset_first, offset_second, boundary_first,
-                boundary_second):
-    if offset_first is not None and offset_second is not None:
-        mask = (offset_first[:, None] < boundary_first) & \
-               (offset_second[None, :] < boundary_second)
-        tensor = tl.load(ptrs, mask=mask, other=0.0)
-    elif offset_first is not None:
-        mask = offset_first[:, None] < boundary_first
-        tensor = tl.load(ptrs, mask=mask, other=0.0)
-    elif offset_second is not None:
-        mask = offset_second[None, :] < boundary_second
-        tensor = tl.load(ptrs, mask=mask, other=0.0)
-    else:
-        tensor = tl.load(ptrs)
-    return tensor
+def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride):
+    ms = tl.arange(0, m)
+    ns = tl.arange(0, n)
+    return philox_offset + ms[:, None] * stride + ns[None, :]
 
 
 @triton.jit
-def compute_alibi_block(alibi_slope,
-                        seqlen_q,
-                        seqlen_k,
-                        offs_m,
-                        offs_n,
-                        transpose=False):
-    # when seqlen_k and seqlen_q are different we want the diagonal to stick to
-    # the bottom right of the attention matrix
-    # for casual mask we want something like this where (1 is kept and 0 is
-    # masked)
-    # seqlen_q = 2 and seqlen_k = 5
-    #   1 1 1 1 0
-    #   1 1 1 1 1
-    # seqlen_q = 5 and seqlen_k = 2
-    #        0 0
-    #        0 0
-    #        0 0
-    #        1 0
-    #        1 1
-    # for alibi the diagonal is 0 indicating no penalty for attending to that
-    # spot and increasing penalty for attending further from the diagonal
-    # e.g. alibi_slope = 1, seqlen_q = 2, seqlen_k = 5,
-    # offs_m = [0, 1, 2, 3], offs_n = [0, 1, 2, 3, 4], transpose = False
-    # 1. offs_m[:,None] = [[0],
-    #                       [1],
-    # 2. offs_m[:,None] + seqlen_k = [[5],
-    #                                  [6],
-    # 3. offs_m[:,None] + seqlen_k - seqlen_q = [[3],
-    #                                             [4],
-    # 4. offs_m[:,None] + seqlen_k - seqlen_q - offs_n[None,:] =
-    # [[3], - [[0, 1, 2, 3, 4]] =  [[ 3, 2, 1, 0,-1], [4], [ 4, 3, 2, 1, 0]]
-    # 5. -1 * alibi_slope * tl.abs(relative_pos_block) = [[ -3, -2, -1, 0,-1],
-    #                                                    [ -4, -3, -2, -1, 0]],
-    relative_pos_block = (offs_m[:, None] + seqlen_k - seqlen_q -
-                          offs_n[None, :])
-    alibi_block = -1 * alibi_slope * tl.abs(relative_pos_block)
-    if transpose:
-        return alibi_block.T
-    else:
-        return alibi_block
+def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride):
+    rng_offsets = dropout_offsets(philox_seed, philox_offset, dropout_p, m, n,
+                                  stride).to(tl.uint32)
+    # TODO: use tl.randint for better performance
+    return tl.rand(philox_seed, rng_offsets)
 
 
-def compute_alibi_tensor(alibi_slopes, seqlen_q, seqlen_k):
-    q_idx = torch.arange(seqlen_q, dtype=torch.int32,
-                         device="cuda").unsqueeze(-1)  # (N_CTX_Q, 1)
-    k_idx = torch.arange(seqlen_k, dtype=torch.int32,
-                         device="cuda").unsqueeze(0)  # (1, N_CTX_K)
-    relative_pos = torch.abs(q_idx + seqlen_k - seqlen_q -
-                             k_idx)  # (N_CTX_Q, N_CTX_K)
-    return -1 * alibi_slopes.unsqueeze(-1).unsqueeze(
-        -1) * relative_pos  # (Z, H, N_CTX_Q, N_CTX_K)
+@triton.jit
+def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride):
+    rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n,
+                             stride)
+    rng_keep = rng_output > dropout_p
+    return rng_keep
 
 
 @triton.jit
-def quant_fp8(x, scale):
-    x *= scale
-    x = tl.clamp(x, FP8_MIN, FP8_MAX)
-    return x
+def load_fn(block_ptr, first, second, pad):
+    if first and second:
+        tensor = tl.load(block_ptr, boundary_check=(0, 1), padding_option=pad)
+    elif first:
+        tensor = tl.load(block_ptr, boundary_check=(0, ), padding_option=pad)
+    elif second:
+        tensor = tl.load(block_ptr, boundary_check=(1, ), padding_option=pad)
+    else:
+        tensor = tl.load(block_ptr)
+    return tensor
 
 
 @triton.jit
@@ -330,68 +83,61 @@ def _attn_fwd_inner(
     l_i,
     m_i,
     q,
-    k_ptrs,
-    v_ptrs,
-    bias_ptrs,
-    stride_kn,
-    stride_vk,
-    stride_bn,
+    K_block_ptr,
+    V_block_ptr,
     start_m,
     actual_seqlen_k,
-    actual_seqlen_q,
+    dropout_p,
     philox_seed,
     batch_philox_offset,
-    encoded_sm_ptrs,
+    encoded_softmax_block_ptr,
     block_min,
     block_max,
     offs_n_causal,
     masked_blocks,
     n_extra_tokens,
-    alibi_slope,
-    q_descale,
-    k_descale,
-    v_descale,
-    p_scale,
+    bias_ptr,
     IS_CAUSAL: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
     BLOCK_N: tl.constexpr,
     OFFS_M: tl.constexpr,
     OFFS_N: tl.constexpr,
-    SHOULD_PRE_LOAD_V: tl.constexpr,
-    SHOULD_MASK_STEPS: tl.constexpr,
-    SHOULD_RETURN_ENCODED_SOFTMAX: tl.constexpr,
-    USE_PADDED_HEAD: tl.constexpr,
-    IS_ACTUAL_BLOCK_DMODEL: tl.constexpr,
-    QK_SCALE: tl.constexpr,
-    IS_EIGHT_BIT_GEMM: tl.constexpr,
-    USE_P_SCALE: tl.constexpr,
-    IS_EIGHT_BIT_KV: tl.constexpr,
-    QUANT_DTYPE: tl.constexpr = default_eight_bit_dtype_triton,
+    PRE_LOAD_V: tl.constexpr,
+    MASK_STEPS: tl.constexpr,
+    ENABLE_DROPOUT: tl.constexpr,
+    RETURN_ENCODED_SOFTMAX: tl.constexpr,
+    PADDED_HEAD: tl.constexpr,
+    USE_FP8: tl.constexpr,
+    qk_scale,
+    p_descale,
 ):
-
     # loop over k, v, and update accumulator
     for start_n in range(block_min, block_max, BLOCK_N):
         # For padded blocks, we will overrun the tensor size if
         # we load all BLOCK_N. For others, the blocks are all within range.
-        k_offs_n = start_n + tl.arange(0,
-                                       BLOCK_N) if SHOULD_MASK_STEPS else None
-        k_offs_k = None if not USE_PADDED_HEAD else tl.arange(0, BLOCK_DMODEL)
-        k = masked_load(k_ptrs, k_offs_k, k_offs_n, IS_ACTUAL_BLOCK_DMODEL,
-                        actual_seqlen_k)
-        if SHOULD_PRE_LOAD_V:
-            # We can use the same offsets as k, just with dims transposed.
-            v = masked_load(v_ptrs, k_offs_n, k_offs_k, actual_seqlen_k,
-                            IS_ACTUAL_BLOCK_DMODEL)
+        k = load_fn(
+            K_block_ptr,
+            PADDED_HEAD,
+            MASK_STEPS and (n_extra_tokens != 0),
+            "zero",
+        )
+        if PRE_LOAD_V:
+            v = load_fn(
+                V_block_ptr,
+                MASK_STEPS and (n_extra_tokens != 0),
+                PADDED_HEAD,
+                "zero",
+            )
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         # We start from end of seqlen_k so only the first iteration would need
         # to be checked for padding if it is not a multiple of block_n
         # TODO: This can be optimized to only be true for the padded block.
-        if SHOULD_MASK_STEPS:  # noqa: SIM102
+        if MASK_STEPS:  # noqa: SIM102
             # If this is the last block / iteration, we want to
             # mask if the sequence length is not a multiple of block size
-            # a solution is to always do BLOCK_M // BLOCK_N + 1 steps if not
-            # is_modulo_mn. last step might get wasted but that is okay.
+            # a solution is to always do BLOCK_M // BLOCK_N + 1 steps
+            # if not is_modulo_mn. last step might get wasted but that is okay.
             # check if this masking works for that case.
             if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0):
                 boundary_m = tl.full([BLOCK_M],
@@ -404,107 +150,112 @@ def _attn_fwd_inner(
             causal_boundary = start_n + offs_n_causal
             causal_mask = OFFS_M[:, None] >= causal_boundary[None, :]
             qk = tl.where(causal_mask, qk, float("-inf"))
-
         # -- compute qk ----
-        if IS_EIGHT_BIT_GEMM:
-            qk += ((((tl.dot(q, k).to(tl.float32) * q_descale)) * k_descale) *
-                   QK_SCALE)
-        else:
-            if IS_EIGHT_BIT_KV:
-                k = (k * k_descale).to(q.type.element_ty)
-            qk += (tl.dot(q, k) * QK_SCALE)
-
-        if bias_ptrs is not None:
-            bias_offs_n = start_n + tl.arange(
-                0, BLOCK_N) if SHOULD_MASK_STEPS else None
-            bias = masked_load(bias_ptrs, OFFS_M, bias_offs_n, actual_seqlen_q,
-                               actual_seqlen_k)
-            # While bias is added after multiplying qk with sm_scale,
-            # our optimization to use 2^x instead of e^x results in an
-            # additional scale factor of log2(e) which we must also multiply
-            # the bias with.
-            qk += (bias * 1.44269504089)
-
-        if alibi_slope is not None:
-            # Compute the global position of each token within the sequence
-            global_m_positions = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-            global_n_positions = start_n + tl.arange(0, BLOCK_N)
-            alibi_block = compute_alibi_block(alibi_slope, actual_seqlen_q,
-                                              actual_seqlen_k,
-                                              global_m_positions,
-                                              global_n_positions)
-            qk += (alibi_block * 1.44269504089)  # scale factor of log2(e)
-
-        # softmax
+        qk += tl.dot(q, k)
+        if USE_FP8:
+            qk *= qk_scale
+        if bias_ptr is not None:
+            bias = load_fn(bias_ptr, False, MASK_STEPS
+                           and (n_extra_tokens != 0), "zero")
+            # While bias is added after multiplying qk with sm_scale, our
+            # optimization to use 2^x instead of e^x results in an additional
+            # scale factor of log2(e) which we must also multiply the bias with.
+            qk += bias * 1.44269504089
         m_ij = tl.maximum(m_i, tl.max(qk, 1))
         qk = qk - m_ij[:, None]
         p = tl.math.exp2(qk)
 
         # CAVEAT: Must update l_ij before applying dropout
         l_ij = tl.sum(p, 1)
-        if SHOULD_RETURN_ENCODED_SOFTMAX:
-            tl.store(encoded_sm_ptrs, p.to(encoded_sm_ptrs.type.element_ty))
+        if ENABLE_DROPOUT:
+            philox_offset = (batch_philox_offset +
+                             start_m * BLOCK_M * actual_seqlen_k + start_n -
+                             BLOCK_N)
+            keep = dropout_mask(
+                philox_seed,
+                philox_offset,
+                dropout_p,
+                BLOCK_M,
+                BLOCK_N,
+                actual_seqlen_k,
+            )
+            if RETURN_ENCODED_SOFTMAX:
+                tl.store(
+                    encoded_softmax_block_ptr,
+                    tl.where(keep, p,
+                             -p).to(encoded_softmax_block_ptr.type.element_ty),
+                )
+            p = tl.where(keep, p, 0.0)
+        elif RETURN_ENCODED_SOFTMAX:
+            tl.store(
+                encoded_softmax_block_ptr,
+                p.to(encoded_softmax_block_ptr.type.element_ty),
+            )
         # -- update output accumulator --
         alpha = tl.math.exp2(m_i - m_ij)
         acc = acc * alpha[:, None]
-        if not SHOULD_PRE_LOAD_V:
-            v = masked_load(v_ptrs, k_offs_n, k_offs_k, actual_seqlen_k,
-                            IS_ACTUAL_BLOCK_DMODEL)
+        if not PRE_LOAD_V:
+            v = load_fn(
+                V_block_ptr,
+                MASK_STEPS and (n_extra_tokens != 0),
+                PADDED_HEAD,
+                "zero",
+            )
         # -- update m_i and l_i
         l_i = l_i * alpha + l_ij
         # update m_i and l_i
         m_i = m_ij
 
-        if IS_EIGHT_BIT_GEMM:
-            if USE_P_SCALE:
-                p = quant_fp8(p, p_scale).to(QUANT_DTYPE)
-                acc += tl.dot(p, v)
-            else:
-                # v is in eight_bit but p is not, we want the gemm in p's type
-                acc += tl.dot(p, v.to(p.type.element_ty))
-        else:
-            if IS_EIGHT_BIT_KV:
-                v = (v * v_descale).to(p.type.element_ty)
-            acc += tl.dot(p.to(v.type.element_ty), v)
-
-        k_ptrs += BLOCK_N * stride_kn
-        v_ptrs += BLOCK_N * stride_vk
-        if bias_ptrs is not None:
-            bias_ptrs += BLOCK_N * stride_bn
-        if SHOULD_RETURN_ENCODED_SOFTMAX:
-            encoded_sm_ptrs += BLOCK_N
+        if USE_FP8:
+            p *= p_descale
+
+        acc += tl.dot(p.to(V_block_ptr.type.element_ty), v)
+
+        V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
+        K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
+        if bias_ptr is not None:
+            bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N))
+        if RETURN_ENCODED_SOFTMAX:
+            encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr,
+                                                   (0, BLOCK_N))
     return acc, l_i, m_i
 
 
 def get_cdna_autotune_configs():
     return [
+        triton.Config(
+            {
+                'BLOCK_M': 256,
+                'BLOCK_N': 64,
+                'waves_per_eu': 2,
+                'PRE_LOAD_V': False
+            },
+            num_stages=1,
+            num_warps=8),
         triton.Config(
             {
                 'BLOCK_M': 128,
                 'BLOCK_N': 128,
                 'waves_per_eu': 2,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=4),
         triton.Config(
             {
-                'BLOCK_M': 128,
-                'BLOCK_N': 64,
+                'BLOCK_M': 256,
+                'BLOCK_N': 128,
                 'waves_per_eu': 2,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
-            num_warps=4),
+            num_warps=8),
         triton.Config(
             {
                 'BLOCK_M': 128,
                 'BLOCK_N': 64,
-                'waves_per_eu': 3,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'waves_per_eu': 1,
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=4),
@@ -512,168 +263,141 @@ def get_cdna_autotune_configs():
             {
                 'BLOCK_M': 128,
                 'BLOCK_N': 64,
-                'waves_per_eu': 1,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'waves_per_eu': 3,
+                'PRE_LOAD_V': True
             },
             num_stages=1,
             num_warps=4),
         triton.Config(
             {
                 'BLOCK_M': 128,
-                'BLOCK_N': 32,
-                'waves_per_eu': 2,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'BLOCK_N': 64,
+                'waves_per_eu': 3,
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=4),
-    ], [
-        'IS_CAUSAL', 'MAX_SEQLENS_Q', 'MAX_SEQLENS_K',
-        'IS_ACTUAL_BLOCK_DMODEL', 'VARLEN', 'HQ', 'HK'
-    ]
-
-
-def get_rdna_autotune_configs():
-    return [
         triton.Config(
             {
-                'BLOCK_M': 32,
-                'BLOCK_N': 32,
+                'BLOCK_M': 64,
+                'BLOCK_N': 64,
                 'waves_per_eu': 4,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
-            num_warps=2),
+            num_warps=8),
         triton.Config(
             {
                 'BLOCK_M': 32,
                 'BLOCK_N': 32,
-                'waves_per_eu': 2,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'waves_per_eu': 4,
+                'PRE_LOAD_V': False
             },
             num_stages=1,
-            num_warps=2),
+            num_warps=8),
+        # TODO: This config fails with head_size not pow2 with data mismatches.
+        #    triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1,
+        #                   'PRE_LOAD_V': False}, num_stages=1, num_warps=4),
+
+        # Fails in AccelerateAMDMatmul (Triton) assert when using FP8:
+        # triton.Config(
+        #     {
+        #         "BLOCK_M": 16,
+        #         "BLOCK_N": 16,
+        #         "waves_per_eu": 1,
+        #         "PRE_LOAD_V": False,
+        #     },
+        #     num_stages=1,
+        #     num_warps=4,
+        # ),
+    ], ['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL', 'USE_FP8']
+
+
+def get_rdna_autotune_configs():
+    return [
         triton.Config(
             {
                 'BLOCK_M': 32,
-                'BLOCK_N': 16,
+                'BLOCK_N': 32,
                 'waves_per_eu': 4,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=2),
         triton.Config(
             {
                 'BLOCK_M': 32,
-                'BLOCK_N': 16,
+                'BLOCK_N': 32,
                 'waves_per_eu': 2,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=2),
         triton.Config(
             {
-                'BLOCK_M': 16,
+                'BLOCK_M': 32,
                 'BLOCK_N': 16,
                 'waves_per_eu': 4,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=2),
         triton.Config(
             {
-                'BLOCK_M': 16,
+                'BLOCK_M': 32,
                 'BLOCK_N': 16,
                 'waves_per_eu': 2,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
-            },
-            num_stages=1,
-            num_warps=2),
-        # Fall-back config.
-        triton.Config(
-            {
-                'BLOCK_M': 16,
-                'BLOCK_N': 16,
-                'waves_per_eu': 1,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
+                'PRE_LOAD_V': False
             },
             num_stages=1,
             num_warps=2),
-    ], [
-        'IS_CAUSAL', 'MAX_SEQLENS_Q', 'MAX_SEQLENS_K',
-        'IS_ACTUAL_BLOCK_DMODEL', 'VARLEN', 'HQ', 'HK'
-    ]
-
-
-def get_general_autotune_configs():
-    return [
-        triton.Config(
-            {
-                'BLOCK_M': 128,
-                'BLOCK_N': 128,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
-            },
-            num_stages=1,
-            num_warps=4),
-        triton.Config(
-            {
-                'BLOCK_M': 128,
-                'BLOCK_N': 64,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
-            },
-            num_stages=1,
-            num_warps=4),
-        triton.Config(
-            {
-                'BLOCK_M': 128,
-                'BLOCK_N': 32,
-                'SHOULD_PRE_LOAD_V': False,
-                'GRID_CU_MULTIP': 2
-            },
-            num_stages=1,
-            num_warps=4),
-    ], [
-        'IS_CAUSAL', 'MAX_SEQLENS_Q', 'MAX_SEQLENS_K',
-        'IS_ACTUAL_BLOCK_DMODEL', 'VARLEN', 'HQ', 'HK'
-    ]
-
-
-def has_cdna_target():
-    ROCM_CDNA_TARGETS = ["gfx942", "gfx90a", "gfx908"]
-    return triton.runtime.driver.active.get_current_target(
-    ).arch in ROCM_CDNA_TARGETS
-
-
-def is_rocm_cdna():
-    return current_platform.is_rocm() and has_cdna_target()
+        # Fails in AccelerateAMDMatmul (Triton) assert when using FP8:
+        # triton.Config(
+        #     {
+        #         'BLOCK_M': 16,
+        #         'BLOCK_N': 16,
+        #         'waves_per_eu': 4,
+        #         'PRE_LOAD_V': False
+        #     },
+        #     num_stages=1,
+        #     num_warps=2),
+        # triton.Config(
+        #     {
+        #         'BLOCK_M': 16,
+        #         'BLOCK_N': 16,
+        #         'waves_per_eu': 2,
+        #         'PRE_LOAD_V': False
+        #     },
+        #     num_stages=1,
+        #     num_warps=2),
+        # # Fall-back config.
+        # triton.Config(
+        #     {
+        #         'BLOCK_M': 16,
+        #         'BLOCK_N': 16,
+        #         'waves_per_eu': 1,
+        #         'PRE_LOAD_V': False
+        #     },
+        #     num_stages=1,
+        #     num_warps=2),
+    ], ['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL', 'USE_FP8']
 
 
 def get_autotune_configs():
-    if is_rocm_cdna():
-        return get_cdna_autotune_configs()
-    elif current_platform.is_rocm():
+    if on_gfx1x():
         return get_rdna_autotune_configs()
     else:
-        return get_general_autotune_configs()
+        return get_cdna_autotune_configs()
 
 
 autotune_configs, autotune_keys = get_autotune_configs()
 
+float8_info = torch.finfo(current_platform.fp8_dtype())
+
 
 @triton.autotune(
     configs=autotune_configs,
     key=autotune_keys,
-    use_cuda_graph=True,
 )
 @triton.jit
 def attn_fwd(
@@ -681,7 +405,13 @@ def attn_fwd(
     K,
     V,
     bias,
-    SM_SCALE: tl.constexpr,
+    sm_scale,
+    q_scale,
+    k_scale,
+    v_scale,
+    p_scale,
+    p_descale,
+    o_descale,
     L,
     Out,
     stride_qz: tl.int64,
@@ -704,70 +434,44 @@ def attn_fwd(
     stride_bh: tl.int64,
     stride_bm: tl.int64,
     stride_bn: tl.int64,
-    stride_az: tl.int64,
-    stride_ah: tl.int64,
-    q_descale_ptr,
-    k_descale_ptr,
-    p_scale_ptr,
-    p_descale_ptr,
-    o_descale_ptr,
-    v_descale_ptr,
-    q_descale_has_singleton: tl.constexpr,
-    k_descale_has_singleton: tl.constexpr,
-    p_descale_has_singleton: tl.constexpr,
-    v_descale_has_singleton: tl.constexpr,
     cu_seqlens_q,
     cu_seqlens_k,
+    dropout_p,
     philox_seed,
-    NUM_CU: tl.constexpr,
-    GRID_CU_MULTIP: tl.constexpr,
-    B: tl.constexpr,
     philox_offset_base,
     encoded_softmax,
-    alibi_slopes,
     HQ: tl.constexpr,
     HK: tl.constexpr,
-    IS_ACTUAL_BLOCK_DMODEL: tl.constexpr,
+    ACTUAL_BLOCK_DMODEL: tl.constexpr,
     MAX_SEQLENS_Q: tl.constexpr,
     MAX_SEQLENS_K: tl.constexpr,
     VARLEN: tl.constexpr,
     IS_CAUSAL: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
+    USE_FP8: tl.constexpr,
+    USE_FP8_OUT: tl.constexpr,
     BLOCK_N: tl.constexpr,
-    SHOULD_PRE_LOAD_V: tl.constexpr,
-    USE_BIAS: tl.constexpr,
-    SHOULD_RETURN_ENCODED_SOFTMAX: tl.constexpr,
-    USE_ALIBI: tl.constexpr,
-    IS_EIGHT_BIT: tl.constexpr,
-    USE_P_SCALE: tl.constexpr,
-    IS_EIGHT_BIT_KV: tl.constexpr,
-    QUANT_DTYPE: tl.constexpr = default_eight_bit_dtype_triton,
+    PRE_LOAD_V: tl.constexpr,
+    BIAS_TYPE: tl.constexpr,
+    ENABLE_DROPOUT: tl.constexpr,
+    RETURN_ENCODED_SOFTMAX: tl.constexpr,
+    FP8_MIN: tl.constexpr = float8_info.min,
+    FP8_MAX: tl.constexpr = float8_info.max,
 ):
-
-    if o_descale_ptr is not None:
-        o_descale = tl.load(o_descale_ptr)
-
-    start_m: tl.int64 = tl.program_id(0)
-    off_h_q: tl.int64 = tl.program_id(1)
-    off_z: tl.int64 = tl.program_id(2)
-
-    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M).to(tl.int64)
-    offs_n = tl.arange(0, BLOCK_N).to(tl.int64)
-    offs_d = tl.arange(0, BLOCK_DMODEL).to(tl.int64)
-
-    # as we can't have return statements inside while loop in Triton
-    continue_condition = True
-
+    start_m = tl.program_id(0)
+    off_h_q = tl.program_id(1)
+    off_z = tl.program_id(2)
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = tl.arange(0, BLOCK_N)
     if VARLEN:
         cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z)
         cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1)
         seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start
-        # We have a one-size-fits-all grid in id(0). Some seqlens might be
-        # too small for all start_m so for those we return early.
+        # We have a one-size-fits-all grid in id(0). Some seqlens might be too
+        # small for all start_m so for those we return early.
         if start_m * BLOCK_M > seqlen_q:
-            continue_condition = False
-            # return
+            return
         cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z)
         cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1)
         seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start
@@ -777,598 +481,499 @@ def attn_fwd(
         seqlen_q = MAX_SEQLENS_Q
         seqlen_k = MAX_SEQLENS_K
 
-    if continue_condition:
-        # Now we compute whether we need to exit early due to causal
-        # masking. This is because for seqlen_q > seqlen_k, M rows of the
-        # attn scores are completely masked, resulting in 0s written to the
-        # output, and inf written to LSE. We don't need to do any GEMMs in
-        # this case. This block of code determines what N is, and if this
-        # WG is operating on those M rows.
-        n_blocks = cdiv_fn(seqlen_k, BLOCK_N)
-        if (IS_CAUSAL):
-            # If seqlen_q == seqlen_k, the attn scores are a square matrix.
-            # If seqlen_q != seqlen_k, attn scores are rectangular which
-            # means the causal mask boundary is bottom right aligned, and
-            # ends at either the top edge (seqlen_q < seqlen_k) or left
-            # edge. This captures the decrease in n_blocks if we have a
-            # rectangular attn matrix
-            n_blocks_seqlen = cdiv_fn(
-                (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N)
-            # This is what adjusts the block_max for the current WG, only
-            # if IS_CAUSAL. Otherwise we want to always iterate through all
-            # n_blocks
-            n_blocks = min(n_blocks, n_blocks_seqlen)
-            # If we have no blocks after adjusting for seqlen deltas, this
-            # WG is part of the blocks that are all 0. We exit early.
-            if n_blocks <= 0:
-                o_offset = (Out + off_z * stride_oz + off_h_q * stride_oh +
-                            cu_seqlens_q_start * stride_om)
-                o_ptrs = (o_offset + offs_m[:, None] * stride_om +
-                          offs_d[None, :] * stride_on)
-                acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-                o_ptrs_mask = (offs_m[:, None] < seqlen_q).broadcast_to(
-                    [BLOCK_M, BLOCK_DMODEL])
-                # We still need to write 0s to the result
-                tl.store(o_ptrs, acc, mask=o_ptrs_mask)
-                # The tensor allocated for L is based on MAX_SEQLENS_Q as
-                # that is statically known.
-                l_ptrs = (L + off_z * HQ * MAX_SEQLENS_Q +
-                          off_h_q * MAX_SEQLENS_Q + offs_m)
-                # We store inf to LSE, not -inf because in the bwd pass,
-                # we subtract this from qk which makes it -inf, such that
-                # exp(qk - inf) = 0 for these masked blocks.
-                l_value = tl.full([BLOCK_M],
-                                  value=float("inf"),
-                                  dtype=tl.float32)
-                l_ptrs_mask = offs_m < MAX_SEQLENS_Q
-                tl.store(l_ptrs, l_value, mask=l_ptrs_mask)
-                # TODO: Should dropout and return encoded softmax be
-                # handled here too?
-                continue_condition = False
-                # return
-
-        if continue_condition:
-            # If MQA / GQA, set the K and V head offsets appropriately.
-            GROUP_SIZE: tl.constexpr = HQ // HK
-            off_h_k = off_h_q // GROUP_SIZE if GROUP_SIZE != 1 else off_h_q
-            n_extra_tokens = 0
-            if seqlen_k < BLOCK_N:
-                n_extra_tokens = BLOCK_N - seqlen_k
-            elif seqlen_k % BLOCK_N:
-                n_extra_tokens = seqlen_k % BLOCK_N
-            USE_PADDED_HEAD: tl.constexpr = (IS_ACTUAL_BLOCK_DMODEL
-                                             != BLOCK_DMODEL)
-
-            # Compute pointers for all the tensors used in this kernel.
-            q_offset = (Q + off_z * stride_qz + off_h_q * stride_qh +
-                        cu_seqlens_q_start * stride_qm)
-            q_ptrs = (q_offset + offs_m[:, None] * stride_qm +
-                      offs_d[None, :] * stride_qk)
-            k_offset = (K + off_z * stride_kz + off_h_k * stride_kh +
-                        cu_seqlens_k_start * stride_kn)
-            k_ptrs = (k_offset + offs_d[:, None] * stride_kk +
-                      offs_n[None, :] * stride_kn)
-            v_offset = (V + off_z * stride_vz + off_h_k * stride_vh +
-                        cu_seqlens_k_start * stride_vk)
-            v_ptrs = (v_offset + offs_n[:, None] * stride_vk +
-                      offs_d[None, :] * stride_vn)
-            # Compute pointers for all scale tensors used in this kernel.
-
-            IS_EIGHT_BIT_GEMM: tl.constexpr = IS_EIGHT_BIT & (
-                not IS_EIGHT_BIT_KV)
-            if IS_EIGHT_BIT:
-                if k_descale_has_singleton:
-                    k_descale_ptrs = k_descale_ptr
-                else:
-                    k_descale_ptrs = k_descale_ptr + off_h_k
-
-                if v_descale_has_singleton:
-                    v_descale_ptrs = v_descale_ptr
-                else:
-                    v_descale_ptrs = v_descale_ptr + off_h_k
-
-                if not IS_EIGHT_BIT_KV:
-                    if q_descale_has_singleton:
-                        q_descale_ptrs = q_descale_ptr
-                    else:
-                        q_descale_ptrs = q_descale_ptr + off_h_q
-                if USE_P_SCALE:
-                    if p_descale_has_singleton:
-                        p_scale_ptrs = p_scale_ptr
-                        p_descale_ptrs = p_descale_ptr
-                    else:
-                        p_scale_ptrs = p_scale_ptr + off_h_q
-                        p_descale_ptrs = p_descale_ptr + off_h_q
-
-            if USE_BIAS:
-                bias_offset = off_h_q * stride_bh
-                bias_ptrs = (bias + bias_offset + offs_m[:, None] * stride_bm +
-                             offs_n[None, :] * stride_bn)
-            else:
-                bias_ptrs = None
-
-            if USE_ALIBI:
-                a_offset = off_z * stride_az + off_h_q * stride_ah
-                alibi_slope = tl.load(alibi_slopes + a_offset)
-            else:
-                alibi_slope = None
-
-            batch_philox_offset = 0
-            # We can ask to return the dropout mask without doing any
-            # dropout. In this case, we return an invalid pointer so
-            # indicate the mask is not valid.
-            if SHOULD_RETURN_ENCODED_SOFTMAX:
-                encoded_sm_base = (encoded_softmax +
-                                   off_h_q * seqlen_q * seqlen_k)
-                encoded_sm_ptrs = (encoded_sm_base +
-                                   offs_m[:, None] * seqlen_k +
-                                   offs_n[None, :])
-            else:
-                encoded_sm_ptrs = None
-            # initialize pointer to m and l
-            m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
-            l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
-            acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-            # scale sm_scale by log_2(e) and use 2^x in the loop as we do
-            # not have native e^x support in HW.
-            QK_SCALE: tl.constexpr = SM_SCALE * 1.44269504089
-            # Q is loaded once at the beginning and shared by all N blocks.
-            q_ptrs_mask = offs_m[:, None] < seqlen_q
-            if USE_PADDED_HEAD:
-                q_ptrs_mask = q_ptrs_mask & (offs_d[None, :]
-                                             < IS_ACTUAL_BLOCK_DMODEL)
-            q = tl.load(q_ptrs, mask=q_ptrs_mask, other=0.0)
-
-            if IS_EIGHT_BIT:
-                k_descale = tl.load(k_descale_ptrs)
-                v_descale = tl.load(v_descale_ptrs)
-                q_descale = None if IS_EIGHT_BIT_KV else tl.load(
-                    q_descale_ptrs)
-                if USE_P_SCALE:
-                    p_scale = tl.load(p_scale_ptrs)
-                    p_descale = tl.load(p_descale_ptrs)
-                else:
-                    p_scale = None
-                    p_descale = None
-            else:
-                q_descale = None
-                k_descale = None
-                v_descale = None
-                p_scale = None
-                p_descale = None
-            # Here we compute how many full and masked blocks we have.
-            padded_block_k = n_extra_tokens != 0
-            is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0)
-            if IS_CAUSAL:
-                # There are always at least BLOCK_M // BLOCK_N masked
-                # blocks.  Additionally there might be one more due to
-                # dissimilar seqlens.
-                masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn)
-            else:
-                # Padding on Q does not need to be masked in the FA loop.
-                masked_blocks = padded_block_k
-            # if IS_CAUSAL, not is_modulo_mn does not always result in an
-            # additional block. In this case we might exceed n_blocks so
-            # pick the min.
-            masked_blocks = min(masked_blocks, n_blocks)
-            n_full_blocks = n_blocks - masked_blocks
-            block_min = 0
-            block_max = n_blocks * BLOCK_N
-            # Compute for full blocks. Here we set causal to false
-            # regardless of its actual value because there is no masking.
-            # Similarly we do not need padding.
-            if n_full_blocks > 0:
-                block_max = (n_blocks - masked_blocks) * BLOCK_N
-                acc, l_i, m_i = _attn_fwd_inner(
-                    acc,
-                    l_i,
-                    m_i,
-                    q,
-                    k_ptrs,
-                    v_ptrs,
-                    bias_ptrs,
-                    stride_kn,
-                    stride_vk,
-                    stride_bn,
-                    start_m,
-                    seqlen_k,
-                    seqlen_q,
-                    philox_seed,
-                    batch_philox_offset,
-                    encoded_sm_ptrs,
-                    # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _
-                    block_min,
-                    block_max,
-                    0,
-                    0,
-                    0,
-                    alibi_slope,
-                    q_descale,
-                    k_descale,
-                    v_descale,
-                    p_scale,
-                    # IS_CAUSAL, ....
-                    False,
-                    BLOCK_M,
-                    BLOCK_DMODEL,
-                    BLOCK_N,
-                    offs_m,
-                    offs_n,
-                    # _, SHOULD_MASK_STEPS, ...
-                    SHOULD_PRE_LOAD_V,
-                    False,
-                    SHOULD_RETURN_ENCODED_SOFTMAX,
-                    USE_PADDED_HEAD,
-                    IS_ACTUAL_BLOCK_DMODEL,
-                    QK_SCALE,
-                    IS_EIGHT_BIT_GEMM,
-                    USE_P_SCALE,
-                    IS_EIGHT_BIT_KV,
-                    QUANT_DTYPE)
-                block_min = block_max
-                block_max = n_blocks * BLOCK_N
-
-            tl.debug_barrier()
-            # Remaining blocks, if any, are full / not masked.
-            if (masked_blocks > 0):
-                if IS_CAUSAL:
-                    offs_n_causal = offs_n + (seqlen_q - seqlen_k)
-                else:
-                    offs_n_causal = 0
-                k_ptrs += n_full_blocks * BLOCK_N * stride_kn
-                v_ptrs += n_full_blocks * BLOCK_N * stride_vk
-                if USE_BIAS:
-                    bias_ptrs += n_full_blocks * BLOCK_N * stride_bn
-                if SHOULD_RETURN_ENCODED_SOFTMAX:
-                    encoded_sm_ptrs += n_full_blocks * BLOCK_N
-                acc, l_i, m_i = _attn_fwd_inner(
-                    acc,
-                    l_i,
-                    m_i,
-                    q,
-                    k_ptrs,
-                    v_ptrs,
-                    bias_ptrs,
-                    stride_kn,
-                    stride_vk,
-                    stride_bn,
-                    start_m,
-                    seqlen_k,
-                    seqlen_q,
-                    philox_seed,
-                    batch_philox_offset,
-                    encoded_sm_ptrs,
-                    block_min,
-                    block_max,
-                    offs_n_causal,
-                    masked_blocks,
-                    n_extra_tokens,
-                    alibi_slope,
-                    q_descale,
-                    k_descale,
-                    v_descale,
-                    p_scale,
-                    IS_CAUSAL,
-                    BLOCK_M,
-                    BLOCK_DMODEL,
-                    BLOCK_N,
-                    offs_m,
-                    offs_n,
-                    # _, SHOULD_MASK_STEPS, ...
-                    SHOULD_PRE_LOAD_V,
-                    True,
-                    SHOULD_RETURN_ENCODED_SOFTMAX,
-                    USE_PADDED_HEAD,
-                    IS_ACTUAL_BLOCK_DMODEL,
-                    QK_SCALE,
-                    IS_EIGHT_BIT_GEMM,
-                    USE_P_SCALE,
-                    IS_EIGHT_BIT_KV,
-                    QUANT_DTYPE)
-
-            if IS_EIGHT_BIT and not IS_EIGHT_BIT_KV:
-                if USE_P_SCALE:
-                    acc *= p_descale
-                acc *= v_descale
-
-            # epilogue
-            # This helps the compiler do Newton Raphson on l_i vs on acc
-            # which is much larger.
-            l_recip = 1 / l_i[:, None]
-            acc = acc * l_recip
-
-            # If seqlen_q > seqlen_k but the delta is not a multiple of
-            # BLOCK_M, then we have one block with a row of all NaNs which
-            # come from computing softmax over a row of all
-            # -infs (-inf - inf = NaN). We check for that here and store 0s
-            # where there are NaNs as these rows should've been zeroed out.
-            end_m_idx = (start_m + 1) * BLOCK_M
-            start_m_idx = start_m * BLOCK_M
-            causal_start_idx = seqlen_q - seqlen_k
-            if IS_EIGHT_BIT and not IS_EIGHT_BIT_KV:  # noqa: SIM102
-                if o_descale_ptr is not None:
-                    acc = quant_fp8(acc, o_descale)
-
-            acc = acc.to(Out.type.element_ty)
-            if IS_CAUSAL:  # noqa: SIM102
-                if (causal_start_idx > start_m_idx
-                        and causal_start_idx < end_m_idx):
-                    out_mask_boundary = tl.full((BLOCK_DMODEL, ),
-                                                causal_start_idx,
-                                                dtype=tl.int32)
-                    mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M)
-                    out_ptrs_mask = (mask_m_offsets[:, None]
-                                     >= out_mask_boundary[None, :])
-                    z = tl.zeros((1, ), tl.float32)
-                    acc = tl.where(out_ptrs_mask, acc,
-                                   z.to(acc.type.element_ty))
-            # write back LSE
-            l_ptrs = (L + off_z * HQ * MAX_SEQLENS_Q +
-                      off_h_q * MAX_SEQLENS_Q + offs_m)
-            # If seqlen_q not multiple of BLOCK_M, we need to mask out the
-            # last few rows. This is only true for the last M block.
-            # For others, overflow_size will be -ve
-            overflow_size = end_m_idx - seqlen_q
-            if overflow_size > 0:
-                boundary = tl.full((BLOCK_M, ),
-                                   BLOCK_M - overflow_size,
-                                   dtype=tl.int32)
-                l_ptrs_mask = tl.arange(0, BLOCK_M) < boundary
-                tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask)
-            else:
-                tl.store(l_ptrs, m_i + tl.math.log2(l_i))
-
-            # write back O
-            o_offset = (Out + off_z * stride_oz + off_h_q * stride_oh +
-                        cu_seqlens_q_start * stride_om)
-            o_ptrs = (o_offset + offs_m[:, None] * stride_om +
-                      offs_d[None, :] * stride_on)
-            o_ptrs_mask = tl.full([BLOCK_M, BLOCK_DMODEL], 1, dtype=tl.int1)
-            if overflow_size > 0:
-                o_ptrs_mask = o_ptrs_mask & (offs_m[:, None] < seqlen_q)
-            if USE_PADDED_HEAD:
-                o_ptrs_mask = o_ptrs_mask & (offs_d[None, :]
-                                             < IS_ACTUAL_BLOCK_DMODEL)
-            tl.store(o_ptrs, acc.to(Out.dtype.element_ty), mask=o_ptrs_mask)
-
-
-def get_shape_from_layout(q, k, metadata):
-    assert metadata.layout in SUPPORTED_LAYOUTS, "Got unsupported layout."
-
-    if metadata.layout == 'thd':
-        nheads_q, nheads_k = q.shape[1], k.shape[1]
-        head_size = q.shape[-1]
-        batch = metadata.num_contexts
-    elif metadata.layout == 'bhsd':
-        batch, nheads_q, _, head_size = q.shape
-        nheads_k = k.shape[1]
-    elif metadata.layout == 'bshd':
-        batch, _, nheads_q, head_size = q.shape
-        nheads_k = k.shape[2]
-    return batch, nheads_q, nheads_k, head_size
-
-
-def get_strides_from_layout(q, k, v, o, metadata):
-    assert metadata.layout in SUPPORTED_LAYOUTS, "Got unsupported layout."
-
-    STRIDE_PERMUTATIONS = {
-        'thd': (None, 1, 0, 2),
-        'bhsd': (0, 1, 2, 3),
-        'bshd': (0, 2, 1, 3),
-    }
-
-    perm = STRIDE_PERMUTATIONS[metadata.layout]
-    stride = lambda x, p: (0 if p is None else x.stride(p))
-    strides = lambda x: (stride(x, p) for p in perm)
-
-    return tuple(strides(x) for x in [q, k, v, o])
+    # Now we compute whether we need to exit early due to causal masking.
+    # This is because for seqlen_q > seqlen_k, M rows of the attn scores
+    # are completely masked, resulting in 0s written to the output, and
+    # inf written to LSE. We don't need to do any GEMMs in this case.
+    # This block of code determines what N is, and if this WG is operating
+    # on those M rows.
+    n_blocks = cdiv_fn(seqlen_k, BLOCK_N)
+    if IS_CAUSAL:
+        # If seqlen_q == seqlen_k, the attn scores are a square matrix.
+        # If seqlen_q != seqlen_k, attn scores are rectangular which means
+        # the causal mask boundary is bottom right aligned, and ends at either
+        # the top edge (seqlen_q < seqlen_k) or left edge.
+        # This captures the decrease in n_blocks if we have a rectangular attn
+        # matrix
+        n_blocks_seqlen = cdiv_fn(
+            (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N)
+        # This is what adjusts the block_max for the current WG, only
+        # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks
+        n_blocks = min(n_blocks, n_blocks_seqlen)
+        # If we have no blocks after adjusting for seqlen deltas, this WG is
+        # part of the blocks that are all 0. We exit early.
+        if n_blocks <= 0:
+            o_offset = (off_z * stride_oz + cu_seqlens_q_start * stride_om +
+                        off_h_q * stride_oh)
+            O_block_ptr = tl.make_block_ptr(
+                base=Out + o_offset,
+                shape=(seqlen_q, BLOCK_DMODEL),
+                strides=(stride_om, stride_on),
+                offsets=(start_m * BLOCK_M, 0),
+                block_shape=(BLOCK_M, BLOCK_DMODEL),
+                order=(1, 0),
+            )
+            acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty)
+            # We still need to write 0s to the result
+            # tl.store(O_block_ptr,
+            # acc.to(Out.type.element_ty), boundary_check=(0,1))
+            # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q
+            #          + offs_m
+            # We store inf to LSE, not -inf because in the bwd pass,
+            # we subtract this
+            # from qk which makes it -inf, such that exp(qk - inf) = 0
+            # for these masked blocks.
+            # l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32)
+            # tl.store(l_ptrs, l)
+            # TODO: Should dropout and return encoded softmax be handled here?
+            return
+
+    # If MQA / GQA, set the K and V head offsets appropriately.
+    GROUP_SIZE: tl.constexpr = HQ // HK
+    off_h_k = off_h_q // GROUP_SIZE if GROUP_SIZE != 1 else off_h_q
+
+    n_extra_tokens = 0
+    if seqlen_k < BLOCK_N:
+        n_extra_tokens = BLOCK_N - seqlen_k
+    elif seqlen_k % BLOCK_N:
+        n_extra_tokens = seqlen_k % BLOCK_N
+    padded_head = ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL
+
+    # Compute pointers for all the tensors used in this kernel.
+    q_offset = (off_z * stride_qz + off_h_q * stride_qh +
+                cu_seqlens_q_start * stride_qm)
+    Q_block_ptr = tl.make_block_ptr(
+        base=Q + q_offset,
+        shape=(seqlen_q, ACTUAL_BLOCK_DMODEL),
+        strides=(stride_qm, stride_qk),
+        offsets=(start_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    k_offset = (off_z * stride_kz + off_h_k * stride_kh +
+                cu_seqlens_k_start * stride_kn)
+    K_block_ptr = tl.make_block_ptr(
+        base=K + k_offset,
+        shape=(ACTUAL_BLOCK_DMODEL, seqlen_k),
+        strides=(stride_kk, stride_kn),
+        offsets=(0, 0),
+        block_shape=(BLOCK_DMODEL, BLOCK_N),
+        order=(0, 1),
+    )
+    v_offset = (off_z * stride_vz + off_h_k * stride_vh +
+                cu_seqlens_k_start * stride_vk)
+    V_block_ptr = tl.make_block_ptr(
+        base=V + v_offset,
+        shape=(seqlen_k, ACTUAL_BLOCK_DMODEL),
+        strides=(stride_vk, stride_vn),
+        offsets=(0, 0),
+        block_shape=(BLOCK_N, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    if BIAS_TYPE != 0:
+        bias_ptr = tl.make_block_ptr(
+            base=bias + off_h_q * stride_bh,
+            shape=(seqlen_q, seqlen_k),
+            strides=(stride_bm, stride_bn),
+            offsets=(start_m * BLOCK_M, 0),
+            block_shape=(BLOCK_M, BLOCK_N),
+            order=(1, 0),
+        )
+    else:
+        bias_ptr = None
+    if ENABLE_DROPOUT:
+        batch_philox_offset = philox_offset_base \
+                              + (off_z * HQ + off_h_q) \
+                              * seqlen_q * seqlen_k
+    else:
+        batch_philox_offset = 0
+    # We can ask to return the dropout mask without actually doing any dropout.
+    # In this case, we return an invalid pointer so indicate the mask is not i
+    # valid.
+    # TODO: Fix encoded softmax. It currently uses just h_q in the base offset.
+    if RETURN_ENCODED_SOFTMAX:
+        encoded_softmax_block_ptr = tl.make_block_ptr(
+            base=encoded_softmax + off_h_q * seqlen_q * seqlen_k,
+            shape=(seqlen_q, seqlen_k),
+            strides=(seqlen_k, 1),
+            offsets=(start_m * BLOCK_M, 0),
+            block_shape=(BLOCK_M, BLOCK_N),
+            order=(1, 0),
+        )
+    else:
+        encoded_softmax_block_ptr = 0
+    # initialize pointer to m and l
+    m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
+    l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+    # scale sm_scale by log_2(e) and use 2^x in the loop as we do not
+    # have native e^x support in HW.
+    qk_scale = sm_scale * 1.44269504089
+    # Q is loaded once at the beginning and shared by all N blocks.
+    q = load_fn(Q_block_ptr, True, padded_head, "zero")
+    if not USE_FP8:
+        q = (q * qk_scale).to(Q_block_ptr.type.element_ty)
+        acc_scale = 1.0
+    else:
+        qk_scale *= q_scale * k_scale
+        acc_scale = p_scale * v_scale
+
+    # Here we compute how many full and masked blocks we have.
+    padded_block_k = n_extra_tokens != 0
+    is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0)
+    if IS_CAUSAL:
+        # There are always at least BLOCK_M // BLOCK_N masked blocks.
+        # Additionally there might be one more due to dissimilar seqlens.
+        masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn)
+    else:
+        # Padding on Q does not need to be masked in the FA loop.
+        masked_blocks = padded_block_k
+    # if IS_CAUSAL, not is_modulo_mn does not always result in an additional
+    # block. In this case we might exceed n_blocks so pick the min.
+    masked_blocks = min(masked_blocks, n_blocks)
+    n_full_blocks = n_blocks - masked_blocks
+    block_min = 0
+    block_max = n_blocks * BLOCK_N
+    # Compute for full blocks. Here we set causal to false regardless of its
+    # value because there is no masking. Similarly we do not need padding.
+    if n_full_blocks > 0:
+        block_max = (n_blocks - masked_blocks) * BLOCK_N
+        acc, l_i, m_i = _attn_fwd_inner(
+            acc,
+            l_i,
+            m_i,
+            q,
+            K_block_ptr,
+            V_block_ptr,
+            start_m,
+            seqlen_k,
+            dropout_p,
+            philox_seed,
+            batch_philox_offset,
+            encoded_softmax_block_ptr,
+            # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _
+            block_min,
+            block_max,
+            0,
+            0,
+            0,
+            bias_ptr,
+            # IS_CAUSAL, ....
+            False,
+            BLOCK_M,
+            BLOCK_DMODEL,
+            BLOCK_N,
+            offs_m,
+            offs_n,
+            # _, MASK_STEPS, ...
+            PRE_LOAD_V,
+            False,
+            ENABLE_DROPOUT,
+            RETURN_ENCODED_SOFTMAX,
+            padded_head,
+            USE_FP8,
+            qk_scale,
+            p_descale,
+        )
+        block_min = block_max
+        block_max = n_blocks * BLOCK_N
+
+    tl.debug_barrier()
+    # Remaining blocks, if any, are full / not masked.
+    if masked_blocks > 0:
+        offs_n_causal = offs_n + (seqlen_q - seqlen_k) if IS_CAUSAL else 0
+        K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N))
+        V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0))
+        if bias_ptr is not None:
+            bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks * BLOCK_N))
+        if RETURN_ENCODED_SOFTMAX:
+            encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr,
+                                                   (0, n_full_blocks))
+        acc, l_i, m_i = _attn_fwd_inner(
+            acc,
+            l_i,
+            m_i,
+            q,
+            K_block_ptr,
+            V_block_ptr,
+            start_m,
+            seqlen_k,
+            dropout_p,
+            philox_seed,
+            batch_philox_offset,
+            encoded_softmax_block_ptr,
+            block_min,
+            block_max,
+            offs_n_causal,
+            masked_blocks,
+            n_extra_tokens,
+            bias_ptr,
+            IS_CAUSAL,
+            BLOCK_M,
+            BLOCK_DMODEL,
+            BLOCK_N,
+            offs_m,
+            offs_n,
+            # _, MASK_STEPS, ...
+            PRE_LOAD_V,
+            True,
+            ENABLE_DROPOUT,
+            RETURN_ENCODED_SOFTMAX,
+            padded_head,
+            USE_FP8,
+            qk_scale,
+            p_descale,
+        )
+    # epilogue
+
+    if USE_FP8:
+        acc *= acc_scale
+    acc = acc / l_i[:, None]
+    if ENABLE_DROPOUT:
+        acc = acc / (1 - dropout_p)
+    # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M,
+    # then we have one block with a row of all NaNs which come from computing
+    # softmax over a row of all -infs (-inf - inf = NaN). We check for that here
+    # and store 0s where there are NaNs as these rows should've been zeroed out.
+    end_m_idx = (start_m + 1) * BLOCK_M
+    start_m_idx = start_m * BLOCK_M
+    causal_start_idx = seqlen_q - seqlen_k
+    if USE_FP8_OUT:
+        acc *= o_descale
+        acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
+    acc = acc.to(Out.type.element_ty)
+    if IS_CAUSAL:  # noqa: SIM102
+        if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx:
+            out_mask_boundary = tl.full((BLOCK_DMODEL, ),
+                                        causal_start_idx,
+                                        dtype=tl.int32)
+            mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M)
+            out_ptrs_mask = (mask_m_offsets[:, None]
+                             >= out_mask_boundary[None, :])
+            z = tl.zeros((1, ), tl.float32)
+            acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty))
+    # write back LSE
+    # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m
+    # If seqlen_q not multiple of BLOCK_M, we need to mask out the last
+    # few rows. This is only true for the last M block. For others,
+    # overflow_size will be -ve
+    # overflow_size = end_m_idx - seqlen_q
+    # if overflow_size > 0:
+    #    boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32)
+    #    # This is a > check because mask being 0 blocks the store.
+    #    l_ptrs_mask = boundary > tl.arange(0, BLOCK_M)
+    #    tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask)
+    # else:
+    #    tl.store(l_ptrs, m_i + tl.math.log2(l_i))
+
+    # write back O
+    o_offset = (off_z * stride_oz + cu_seqlens_q_start * stride_om +
+                off_h_q * stride_oh)
+    O_block_ptr = tl.make_block_ptr(
+        base=Out + o_offset,
+        shape=(seqlen_q, ACTUAL_BLOCK_DMODEL),
+        strides=(stride_om, stride_on),
+        offsets=(start_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    # Need boundary check on this to make sure the padding from the
+    # Q and KV tensors in both dims are not part of what we store back.
+    # TODO: Do the boundary check optionally.
+    tl.store(O_block_ptr, acc, boundary_check=(0, 1))
+
+
+def check_args(
+    q,
+    k,
+    v,
+    o,
+    varlen=True,
+    max_seqlens=None,
+    cu_seqlens_q=None,
+    cu_seqlens_k=None,
+):
+    assert q.dim() == k.dim() and q.dim() == v.dim()
+    if varlen:
+        assert q.dim() == 3
+        total_q, nheads_q, head_size = q.shape
+        total_k, nheads_k, _ = k.shape
+        assert cu_seqlens_q is not None
+        assert cu_seqlens_k is not None
+        assert len(cu_seqlens_q) == len(cu_seqlens_k)
+    else:
+        assert q.dim() == 4
+        batch, nheads_q, seqlen_q, head_size = q.shape
+        _, nheads_k, seqlen_k, _ = k.shape
+        assert max_seqlens > 0
+    assert k.shape == v.shape
+    assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1]
+    # TODO: Change assert if we support qkl f8 and v f16
+    assert q.dtype == k.dtype and q.dtype == v.dtype
+    assert head_size <= 256
+    assert o.shape == q.shape
+    assert (nheads_q % nheads_k) == 0
 
 
 class _attention(torch.autograd.Function):
 
     @staticmethod
-    def forward(ctx, q, k, v, o, metadata: MetaData):
-        # NOTE: a large bias tensor leads to overflow during pointer arithmetic
-        if (metadata.bias is not None):
-            assert (metadata.bias.numel() < 2**31)
+    def forward(
+        ctx,
+        q,
+        k,
+        v,
+        o,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlens_q,
+        max_seqlens_k,
+        causal=False,
+        sm_scale=1.0,
+        bias=None,
+        fp8_scales=None,
+        fp8_out_scale=None,
+    ):
+        if fp8_scales is not None:
+            use_fp8 = True
+            (q_scale, k_scale, v_scale, p_scale) = fp8_scales
+            float8 = current_platform.fp8_dtype()
+
+            def check_and_convert(t, scale):
+                if t.dtype != float8:
+                    descale = 1.0 / scale
+                    ts = (t * descale).clamp(min=float8_info.min,
+                                             max=float8_info.max)
+                    return ts.to(float8)
+                else:
+                    return t
 
-        if o is None:
-            if metadata.eight_bit:
-                o = torch.empty_like(
-                    q,
-                    dtype=metadata.output_dtype if metadata.output_dtype
-                    is not None else metadata.eight_bit_dtype_torch)
-            else:
-                o = torch.empty_like(q, dtype=q.dtype)
+            q = check_and_convert(q, q_scale)
+            k = check_and_convert(k, k_scale)
+            v = check_and_convert(v, v_scale)
+        else:
+            use_fp8 = False
+            q_scale = k_scale = v_scale = p_scale = 1.0
 
-        metadata.check_args(q, k, v, o)
+        if o is None:
+            o = torch.empty_like(q, dtype=v.dtype)
 
-        batch, nheads_q, nheads_k, head_size = get_shape_from_layout(
-            q, k, metadata)
-        q_strides, k_strides, v_strides, o_strides = get_strides_from_layout(
-            q, k, v, o, metadata)
+        check_args(
+            q,
+            k,
+            v,
+            o,
+            varlen=True,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+        )
+        if True:  # varlen
+            total_q, nheads_q, head_size = q.shape
+            total_k, nheads_k, _ = k.shape
+            batch = len(cu_seqlens_q) - 1
+            q_strides = (0, q.stride(1), q.stride(0), q.stride(2))
+            k_strides = (0, k.stride(1), k.stride(0), k.stride(2))
+            v_strides = (0, v.stride(1), v.stride(0), v.stride(2))
+            o_strides = (0, o.stride(1), o.stride(0), o.stride(2))
+        else:
+            batch, seqlen_q, nheads_q, head_size = q.shape
+            _, seqlen_k, nheads_k, _ = k.shape
+            q_strides = (q.stride(0), q.stride(2), q.stride(1), q.stride(3))
+            k_strides = (k.stride(0), k.stride(2), k.stride(1), k.stride(3))
+            v_strides = (v.stride(0), v.stride(2), v.stride(1), v.stride(3))
+            o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3))
 
         # Get closest power of 2 over or equal to 32.
-        padded_d_model = 1 << (head_size - 1).bit_length()
-        # Smallest head_dim supported is 16. If smaller, the tile in the
-        # kernel is padded - there is no padding in memory for any dims.
-        padded_d_model = max(padded_d_model, 16)
-
-        # encoded_softmax is used to validate dropout behavior vs the
-        # PyTorch SDPA math backend reference.  We zero this out to give a
-        # consistent starting point and then populate it with the output of
-        # softmax with the sign bit set according to the dropout mask.
-        # The resulting return allows this mask to be fed into the reference
-        # implementation for testing only.  This return holds no useful output
-        # aside from debugging.
-        if metadata.return_encoded_softmax:
-            encoded_softmax = torch.zeros(
-                (q.shape[0], q.shape[1], q.shape[2], k.shape[2]),
-                device=q.device,
-                dtype=torch.float32)
+        unpadded_head_dims = {32, 64, 128, 256}
+        if head_size not in unpadded_head_dims:
+            padded_d_model = None
+            for i in unpadded_head_dims:
+                if i > head_size:
+                    padded_d_model = i
+                    break
+            assert padded_d_model is not None
         else:
-            encoded_softmax = None
+            padded_d_model = head_size
+
+        grid = lambda META: (
+            triton.cdiv(max_seqlens_q, META["BLOCK_M"]),
+            nheads_q,
+            batch,
+        )
 
-        M = torch.empty((batch, nheads_q, metadata.max_seqlens_q),
-                        device=q.device,
-                        dtype=torch.float32)
+        encoded_softmax = None
 
         # Seed the RNG so we get reproducible results for testing.
         philox_seed = 0x1BF52
         philox_offset = 0x1D4B42
 
-        if metadata.bias is not None:
-            bias_strides = (metadata.bias.stride(0), metadata.bias.stride(1),
-                            metadata.bias.stride(2), metadata.bias.stride(3))
+        if bias is not None:
+            bias_strides = (
+                bias.stride(0),
+                bias.stride(1),
+                bias.stride(2),
+                bias.stride(3),
+            )
         else:
             bias_strides = (0, 0, 0, 0)
 
-        if metadata.alibi_slopes is not None:
-            alibi_strides = (metadata.alibi_slopes.stride(0),
-                             metadata.alibi_slopes.stride(1))
-        else:
-            alibi_strides = (0, 0)
+        p_descale = 1.0 / p_scale
+        o_descale = 1.0 / fp8_out_scale.item(
+        ) if fp8_out_scale is not None else 1.0
 
-        if metadata.eight_bit:
-            q_descale, k_descale, p_scale, p_descale, v_descale, o_scale = (
-                metadata.q_descale, metadata.k_descale, metadata.p_scale,
-                metadata.p_descale, metadata.v_descale, metadata.o_scale)
-            o_descale = 1.0 / o_scale if o_scale is not None else None
-        else:
-            q_descale = k_descale = p_scale = None
-            p_descale = v_descale = o_descale = None
-
-        # number of compute units available
-        NUM_CU = torch.cuda.get_device_properties("cuda").multi_processor_count
-
-        grid = lambda META: (triton.cdiv(metadata.max_seqlens_q, META[
-            'BLOCK_M']), nheads_q, batch)
+        arg_max_seqlens_q = 0 if on_gfx1x() else max_seqlens_q
+        arg_max_seqlens_k = 0 if on_gfx1x() else max_seqlens_k
 
         attn_fwd[grid](
             q,
             k,
             v,
-            metadata.bias,
-            metadata.sm_scale,
-            M,
+            bias,
+            sm_scale,
+            q_scale,
+            k_scale,
+            v_scale,
+            p_scale,
+            p_descale,
+            o_descale,
+            None,
             o,
             *q_strides,
             *k_strides,
             *v_strides,
             *o_strides,
             *bias_strides,
-            *alibi_strides,
-            q_descale,
-            k_descale,
-            p_scale,
-            p_descale,
-            o_descale,
-            v_descale,
-            q_descale.numel() == 1 if q_descale is not None else False,
-            k_descale.numel() == 1 if k_descale is not None else False,
-            p_descale.numel() == 1 if p_descale is not None else False,
-            v_descale.numel() == 1 if v_descale is not None else False,
-            metadata.cu_seqlens_q,
-            metadata.cu_seqlens_k,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            dropout_p=0.0,
             philox_seed=philox_seed,
             philox_offset_base=philox_offset,
             encoded_softmax=encoded_softmax,
-            alibi_slopes=metadata.alibi_slopes,
             HQ=nheads_q,
             HK=nheads_k,
-            IS_ACTUAL_BLOCK_DMODEL=head_size,
-            MAX_SEQLENS_Q=metadata.max_seqlens_q,
-            MAX_SEQLENS_K=metadata.max_seqlens_k,
-            IS_CAUSAL=metadata.causal,
-            VARLEN=metadata.varlen,
+            ACTUAL_BLOCK_DMODEL=head_size,
+            MAX_SEQLENS_Q=arg_max_seqlens_q,
+            MAX_SEQLENS_K=arg_max_seqlens_k,
+            IS_CAUSAL=causal,
+            VARLEN=True,
             BLOCK_DMODEL=padded_d_model,
-            USE_BIAS=metadata.bias is not None,
-            USE_ALIBI=metadata.alibi_slopes is not None,
-            SHOULD_RETURN_ENCODED_SOFTMAX=metadata.return_encoded_softmax,
-            IS_EIGHT_BIT=metadata.eight_bit,
-            USE_P_SCALE=metadata.eight_bit and metadata.use_p_scale,
-            IS_EIGHT_BIT_KV=metadata.eight_bit and metadata.eight_bit_kv,
-            NUM_CU=NUM_CU,
-            B=batch,
-            QUANT_DTYPE=metadata.eight_bit_dtype_triton)
+            BIAS_TYPE=0 if bias is None else 1,
+            ENABLE_DROPOUT=False,
+            RETURN_ENCODED_SOFTMAX=False,
+            USE_FP8=use_fp8,
+            USE_FP8_OUT=fp8_out_scale is not None,
+        )
 
         ctx.grid = grid
-        ctx.sm_scale = metadata.sm_scale
+        ctx.sm_scale = sm_scale
         ctx.BLOCK_DMODEL = head_size
-        ctx.causal = metadata.causal
-        ctx.alibi_slopes = metadata.alibi_slopes
+        ctx.causal = causal
+        ctx.dropout_p = 0.0
         ctx.philox_seed = philox_seed
         ctx.philox_offset = philox_offset
         ctx.encoded_softmax = encoded_softmax
-        ctx.return_encoded_softmax = metadata.return_encoded_softmax
+        ctx.return_encoded_softmax = False
         return o, encoded_softmax
 
 
-triton_attention_rocm = _attention.apply
-
-
-def scale_fp8(t, scale=None):
-    t_scaled, scale_out = ops.scaled_fp8_quant(t.reshape(-1, t.shape[-1]),
-                                               scale)
-    return t_scaled.reshape(t.shape), scale_out
-
-
-def maybe_quantize_fp8(t, scale):
-    eight_bit_dtype = current_platform.fp8_dtype()
-    if t.dtype != eight_bit_dtype:
-        t, _ = scale_fp8(t, scale)
-    return t
-
-
-def check_and_maybe_quantize_qkv(q, k, v, fp8_scales):
-    (q_scale, k_scale, v_scale, p_scale) = fp8_scales
-
-    q = maybe_quantize_fp8(q, q_scale)
-    k = maybe_quantize_fp8(k, k_scale)
-    v = maybe_quantize_fp8(v, v_scale)
-
-    return q, k, v
-
-
-# query   - [num_tokens, num_heads, head_size]
-# key     - [num_tokens, num_kv_heads, head_size]
-# value   - [num_tokens, num_kv_heads, head_size
-# output  - [num_tokens, num_heads, head_size]
-def triton_attention(
-    q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    o: torch.Tensor,
-    cu_seqlens_q: torch.Tensor,
-    cu_seqlens_k: torch.Tensor,
-    max_seqlens_q: int,
-    max_seqlens_k: int,
-    causal: bool = False,
-    sm_scale: float = 1.0,
-    bias: Optional[torch.Tensor] = None,
-    fp8_scales: Optional[tuple[float, ...]] = None,
-    input_scale: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
-    if fp8_scales is not None:
-        q_descale, k_descale, v_descale, p_scale = fp8_scales
-    else:
-        q_descale = k_descale = v_descale = p_scale = None
-
-    attn_metadata = MetaData(sm_scale=sm_scale,
-                             max_seqlens_q=max_seqlens_q,
-                             max_seqlens_k=max_seqlens_k,
-                             causal=causal,
-                             bias=bias,
-                             output_dtype=q.dtype,
-                             cu_seqlens_q=cu_seqlens_q,
-                             cu_seqlens_k=cu_seqlens_k,
-                             q_descale=q_descale,
-                             k_descale=k_descale,
-                             v_descale=v_descale,
-                             p_scale=p_scale,
-                             o_scale=input_scale)
-
-    if fp8_scales is not None:
-        q, k, v = check_and_maybe_quantize_qkv(q, k, v, fp8_scales)
-
-    return triton_attention_rocm(q, k, v, o, attn_metadata)
+triton_attention = _attention.apply
diff --git a/vllm/attention/ops/triton_merge_attn_states.py b/vllm/attention/ops/triton_merge_attn_states.py
index 30e61b6d82639cf91d81470b78598ca98236f88b..56d78ed5ea6ee0a024e58c733589189f2f0855f3 100644
--- a/vllm/attention/ops/triton_merge_attn_states.py
+++ b/vllm/attention/ops/triton_merge_attn_states.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py
index 4bced779785afab9dfca36d1fdb3c4322222c4bf..92c09e6dd0640c66945a198525706f4ad115ef57 100644
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/attention/ops/triton_unified_attention.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Authors:
 #  - Burkhard Ringlein <ngl@zurich.ibm.com>
@@ -29,41 +30,42 @@ def apply_softcap(S, x):
 
 @triton.jit
 def kernel_unified_attention_2d(
-    output_ptr,  # [num_tokens, num_query_heads, head_size]
-    query_ptr,  # [num_tokens, num_query_heads, head_size]
-    key_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
-    value_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
-    block_tables_ptr,  # [num_seqs, max_num_blocks_per_seq]
-    seq_lens_ptr,  # [num_seqs]
-    alibi_slopes_ptr,  # [num_query_heads]
-    scale,  # float32
-    k_scale,  # float32
-    v_scale,  # float32
-    softcap,  # float32
-    num_query_heads: tl.constexpr,  # int
-    num_queries_per_kv: tl.constexpr,  # int
-    block_table_stride: tl.int64,  # int
-    query_stride_0: tl.int64,  # int
-    query_stride_1: tl.int64,  # int, should be equal to head_size
-    output_stride_0: tl.int64,  # int
-    output_stride_1: tl.int64,  # int, should be equal to head_size
-    BLOCK_SIZE: tl.constexpr,  # int
-    HEAD_SIZE: tl.constexpr,  # int
-    HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
-    USE_ALIBI_SLOPES: tl.constexpr,  # bool
-    USE_SOFTCAP: tl.constexpr,  # bool
-    SLIDING_WINDOW: tl.constexpr,  # int
-    stride_k_cache_0: tl.int64,  # int
-    stride_k_cache_1: tl.int64,  # int
-    stride_k_cache_2: tl.int64,  # int
-    stride_k_cache_3: tl.constexpr,  # int
-    stride_v_cache_0: tl.int64,  # int
-    stride_v_cache_1: tl.int64,  # int
-    stride_v_cache_2: tl.int64,  # int
-    stride_v_cache_3: tl.constexpr,  # int
-    query_start_len_ptr,  # [num_seqs+1]
-    BLOCK_Q: tl.constexpr,  # int
-    num_seqs: tl.int32,
+        output_ptr,  # [num_tokens, num_query_heads, head_size]
+        query_ptr,  # [num_tokens, num_query_heads, head_size]
+        key_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
+        value_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
+        block_tables_ptr,  # [num_seqs, max_num_blocks_per_seq]
+        seq_lens_ptr,  # [num_seqs]
+        alibi_slopes_ptr,  # [num_query_heads]
+        scale,  # float32
+        k_scale,  # float32
+        v_scale,  # float32
+        softcap,  # float32
+        num_query_heads: tl.constexpr,  # int
+        num_queries_per_kv: tl.constexpr,  # int
+        block_table_stride: tl.int64,  # int
+        query_stride_0: tl.int64,  # int
+        query_stride_1: tl.int64,  # int, should be equal to head_size
+        output_stride_0: tl.int64,  # int
+        output_stride_1: tl.int64,  # int, should be equal to head_size
+        BLOCK_SIZE: tl.constexpr,  # int
+        HEAD_SIZE: tl.constexpr,  # int
+        HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
+        USE_ALIBI_SLOPES: tl.constexpr,  # bool
+        USE_SOFTCAP: tl.constexpr,  # bool
+        SLIDING_WINDOW: tl.constexpr,  # int
+        stride_k_cache_0: tl.int64,  # int
+        stride_k_cache_1: tl.int64,  # int
+        stride_k_cache_2: tl.int64,  # int
+        stride_k_cache_3: tl.constexpr,  # int
+        stride_v_cache_0: tl.int64,  # int
+        stride_v_cache_1: tl.int64,  # int
+        stride_v_cache_2: tl.int64,  # int
+        stride_v_cache_3: tl.constexpr,  # int
+        query_start_len_ptr,  # [num_seqs+1]
+        BLOCK_Q: tl.constexpr,  # int
+        num_seqs: tl.int32,
+        BLOCK_M: tl.constexpr,  # int
 ):
 
     q_block_global_idx = tl.program_id(0)
@@ -94,15 +96,13 @@ def kernel_unified_attention_2d(
     if q_block_local_idx * BLOCK_Q >= cur_batch_query_len:
         return
 
-    offs_m = tl.arange(0, BLOCK_Q * num_queries_per_kv)
+    offs_m = tl.arange(0, BLOCK_M)
     offs_d = tl.arange(0, HEAD_SIZE_PADDED)
-
     query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv
 
     query_offset_0 = cur_batch_in_all_start_index + query_pos
     query_offset_1 = kv_head_idx * num_queries_per_kv + \
         offs_m % num_queries_per_kv
-
     query_offset = (query_offset_0[:, None] * query_stride_0 +
                     query_offset_1[:, None] * query_stride_1 + offs_d[None, :])
 
@@ -110,7 +110,7 @@ def kernel_unified_attention_2d(
     query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1)
     query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1)
 
-    # Q : (BLOCK_Q * num_queries_per_kv, HEAD_SIZE,)
+    # Q : (BLOCK_M, HEAD_SIZE_PADDED)
     Q = tl.load(
         query_ptr + query_offset,
         mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None],
@@ -119,12 +119,9 @@ def kernel_unified_attention_2d(
 
     block_table_offset = seq_idx * block_table_stride
 
-    M = tl.full([BLOCK_Q * num_queries_per_kv],
-                float("-inf"),
-                dtype=tl.float32)
-    L = tl.full([BLOCK_Q * num_queries_per_kv], 1.0, dtype=tl.float32)
-    acc = tl.zeros([BLOCK_Q * num_queries_per_kv, HEAD_SIZE_PADDED],
-                   dtype=tl.float32)
+    M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
+    L = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32)
 
     # sequence len for this particular sequence
     seq_len = tl.load(seq_lens_ptr + seq_idx)
@@ -183,13 +180,12 @@ def kernel_unified_attention_2d(
         else:
             V = V_load
 
-        seq_offset = j * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+        seq_offset = j * BLOCK_SIZE + offs_n
 
         seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1
 
-        # S : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,)
-        S = tl.zeros(shape=(BLOCK_Q * num_queries_per_kv, BLOCK_SIZE),
-                     dtype=tl.float32)
+        # S : (BLOCK_M, BLOCK_SIZE)
+        S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32)
 
         S += scale * tl.dot(Q, K)
 
@@ -207,29 +203,29 @@ def kernel_unified_attention_2d(
             S += alibi_slope[:, None] * (seq_offset - context_len)
 
         # compute running maximum
-        # m_j : (BLOCK_Q * num_queries_per_kv,)
+        # m_j : (BLOCK_M,)
         m_j = tl.maximum(M, tl.max(S, axis=1))
         # For sliding window there's a chance the max is -inf due to masking of
         # the entire row. In this case we need to set m_j 0 to avoid NaN
         m_j = tl.where(m_j > float("-inf"), m_j, 0.0)
 
-        # P : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,)
+        # P : (BLOCK_M, BLOCK_SIZE)
         P = tl.exp(S - m_j[:, None])
 
-        # l_j : (BLOCK_Q * num_queries_per_kv,)
+        # l_j : (BLOCK_M,)
         l_j = tl.sum(P, axis=1)
 
-        # alpha : (BLOCK_Q * num_queries_per_kv, )
+        # alpha : (BLOCK_M, )
         alpha = tl.exp(M - m_j)
 
-        # acc : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,)
+        # acc : (BLOCK_M, HEAD_SIZE_PADDED)
         acc = acc * alpha[:, None]
 
         # update constants
         L = L * alpha + l_j
         M = m_j
 
-        # acc : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,)
+        # acc : (BLOCK_M, HEAD_SIZE_PADDED)
         acc += tl.dot(P.to(V.dtype), V)
 
     # epilogue
@@ -334,4 +330,5 @@ def unified_attention(
         query_start_len_ptr=cu_seqlens_q,
         BLOCK_Q=BLOCK_Q,
         num_seqs=num_seqs,
+        BLOCK_M=BLOCK_M,
     )
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index ebbdea27f413eeec8ed2bc4ea872dc1bf7e84177..cb577fa6730232193f1aebe3e21d257f17c101af 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from contextlib import contextmanager
diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py
index ca88549f3f729685a469ba8feebd323c63266e8a..69cde06fd72e96e5d36b4e034d05bba1ca3616de 100644
--- a/vllm/attention/utils/fa_utils.py
+++ b/vllm/attention/utils/fa_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 from vllm import envs
diff --git a/vllm/beam_search.py b/vllm/beam_search.py
index 967510abaeb9b8507b7f94dc9ea9dc5bd159ab8e..f3bc4218323d80f45fca8cb820065edd827648bc 100644
--- a/vllm/beam_search.py
+++ b/vllm/beam_search.py
@@ -1,8 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional, Union
 
+from vllm.lora.request import LoRARequest
 from vllm.sequence import Logprob
 
 if TYPE_CHECKING:
@@ -19,6 +21,7 @@ class BeamSearchSequence:
     # The tokens includes the prompt.
     tokens: list[int]
     logprobs: list[dict[int, Logprob]]
+    lora_request: Optional[LoRARequest] = None
     cum_logprob: float = 0.0
     text: Optional[str] = None
     finish_reason: Optional[str] = None
@@ -41,6 +44,7 @@ class BeamSearchInstance:
     def __init__(
         self,
         prompt_tokens: list[int],
+        lora_request: Optional[LoRARequest] = None,
         logprobs: Optional[list[dict[int, Logprob]]] = None,
         **kwargs,
     ):
@@ -48,6 +52,7 @@ class BeamSearchInstance:
             BeamSearchSequence(
                 tokens=prompt_tokens,
                 logprobs=[] if logprobs is None else list(logprobs),
+                lora_request=lora_request,
                 **kwargs,
             )
         ]
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 74a9b2b03391b9a30e12dcaf1dc67a30b9ff5395..4da9f7368e6314b6bde377762f1e59f8113107ca 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This module defines a framework for sampling benchmark requests from various
 datasets. Each dataset subclass of BenchmarkDataset must implement sample
@@ -9,9 +10,6 @@ generation. Supported dataset types include:
   - BurstGPT
   - HuggingFace
   - VisionArena
-
-TODO: Implement CustomDataset to parse a JSON file and convert its contents into
-SampleRequest instances, similar to the approach used in ShareGPT.
 """
 import base64
 import io
@@ -34,6 +32,23 @@ from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.image import convert_image_mode
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
+from vllm.utils import PlaceholderModule
+
+try:
+    from datasets import load_dataset
+except ImportError:
+    datasets = PlaceholderModule("datasets")
+    load_dataset = datasets.placeholder_attr("load_dataset")
+
+try:
+    import pandas as pd
+except ImportError:
+    pd = PlaceholderModule("pandas")
+
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")
 
 logger = logging.getLogger(__name__)
 
@@ -62,6 +77,7 @@ class SampleRequest:
 
 class BenchmarkDataset(ABC):
     DEFAULT_SEED = 0
+    IS_MULTIMODAL = False
 
     def __init__(
         self,
@@ -316,20 +332,22 @@ class RandomDataset(BenchmarkDataset):
         )
 
         vocab_size = tokenizer.vocab_size
+        num_special_tokens = tokenizer.num_special_tokens_to_add()
+        real_input_len = input_len - num_special_tokens
 
         prefix_token_ids = (np.random.randint(
             0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
 
         # New sampling logic: [X * (1 - b), X * (1 + b)]
-        input_low = int(input_len * (1 - range_ratio))
-        input_high = int(input_len * (1 + range_ratio))
+        input_low = int(real_input_len * (1 - range_ratio))
+        input_high = int(real_input_len * (1 + range_ratio))
         output_low = int(output_len * (1 - range_ratio))
         output_high = int(output_len * (1 + range_ratio))
 
         # Add logging for debugging
-        logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
-        logger.info("Sampling output_len from [%s, %s]", output_low,
-                    output_high)
+        logger.info(
+            "Sampling input_len from [%s, %s] and output_len from [%s, %s]",
+            input_low, input_high, output_low, output_high)
 
         input_lens = np.random.randint(input_low,
                                        input_high + 1,
@@ -345,6 +363,17 @@ class RandomDataset(BenchmarkDataset):
                          vocab_size).tolist()
             token_sequence = prefix_token_ids + inner_seq
             prompt = tokenizer.decode(token_sequence)
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N number of tokens.
+            # For example for GPT2Tokenizer:
+            # [6880, 6881] -> ['Ġcalls', 'here'] ->
+            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
+            # To avoid uncontrolled change of the prompt length,
+            # the encoded sequence is truncated before being decode again.
+            re_encoded_sequence = tokenizer.encode(
+                prompt, add_special_tokens=False)[:input_lens[i]]
+            prompt = tokenizer.decode(re_encoded_sequence)
             total_input_len = prefix_len + int(input_lens[i])
             requests.append(
                 SampleRequest(
@@ -429,6 +458,99 @@ class ShareGPTDataset(BenchmarkDataset):
         return samples
 
 
+# -----------------------------------------------------------------------------
+# Custom Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class CustomDataset(BenchmarkDataset):
+    """
+    Implements the Custom dataset.  Loads data from a JSONL file and generates
+    sample requests based on conversation turns. E.g.,
+    ```
+    {"prompt": "What is the capital of India?"}
+    {"prompt": "What is the capital of Iran?"}
+    {"prompt": "What is the capital of China?"}
+    ```
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.load_data()
+
+    def load_data(self) -> None:
+        if self.dataset_path is None:
+            raise ValueError("dataset_path must be provided for loading data.")
+
+        # self.data will be a list of dictionaries
+        # e.g., [{"prompt": "What is the capital of India?"}, ...]
+        # This will be the standardized format which load_data()
+        # has to convert into depending on the filetype of dataset_path.
+        # sample() will assume this standardized format of self.data
+        self.data = []
+
+        # Load the JSONL file
+        if self.dataset_path.endswith(".jsonl"):
+            jsonl_data = pd.read_json(path_or_buf=self.dataset_path,
+                                      lines=True)
+
+            # check if the JSONL file has a 'prompt' column
+            if "prompt" not in jsonl_data.columns:
+                raise ValueError("JSONL file must contain a 'prompt' column.")
+
+            # Convert each row to a dictionary and append to self.data
+            # This will convert the DataFrame to a list of dictionaries
+            # where each dictionary corresponds to a row in the DataFrame.
+            # This is the standardized format we want for self.data
+            for _, row in jsonl_data.iterrows():
+                self.data.append(row.to_dict())
+        else:
+            raise NotImplementedError(
+                "Only JSONL format is supported for CustomDataset.")
+
+        random.seed(self.random_seed)
+        random.shuffle(self.data)
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        lora_path: Optional[str] = None,
+        max_loras: Optional[int] = None,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        skip_chat_template: bool = False,
+        **kwargs,
+    ) -> list:
+        sampled_requests = []
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            prompt = item["prompt"]
+
+            # apply template
+            if not skip_chat_template:
+                prompt = tokenizer.apply_chat_template(
+                    [{
+                        "role": "user",
+                        "content": prompt
+                    }],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+
+            prompt_len = len(tokenizer(prompt).input_ids)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+
+        return sampled_requests
+
+
 # -----------------------------------------------------------------------------
 # Sonnet Dataset Implementation
 # -----------------------------------------------------------------------------
@@ -530,13 +652,6 @@ class BurstGPTDataset(BenchmarkDataset):
         if self.dataset_path is None:
             raise ValueError("dataset_path must be provided for loading data.")
 
-        try:
-            import pandas as pd
-        except ImportError as e:
-            raise ImportError(
-                "Pandas is required for BurstGPTDataset. Please install it "
-                "using `pip install pandas`.") from e
-
         df = pd.read_csv(self.dataset_path)
         # Filter to keep only GPT-4 rows.
         gpt4_df = df[df["Model"] == "GPT-4"]
@@ -611,13 +726,6 @@ class HuggingFaceDataset(BenchmarkDataset):
 
     def load_data(self) -> None:
         """Load data from HuggingFace datasets."""
-        try:
-            from datasets import load_dataset
-        except ImportError as e:
-            raise ImportError(
-                "Hugging Face datasets library is required for this dataset. "
-                "Please install it using `pip install datasets`.") from e
-
         self.data = load_dataset(
             self.dataset_path,
             name=self.dataset_subset,
@@ -637,6 +745,7 @@ class ConversationDataset(HuggingFaceDataset):
     SUPPORTED_DATASET_PATHS = {
         'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
     }
+    IS_MULTIMODAL = True
 
     def sample(self,
                tokenizer: PreTrainedTokenizerBase,
@@ -701,6 +810,7 @@ class VisionArenaDataset(HuggingFaceDataset):
         "lmarena-ai/vision-arena-bench-v0.1":
         lambda x: x["turns"][0][0]["content"]
     }
+    IS_MULTIMODAL = True
 
     def sample(
         self,
@@ -772,7 +882,77 @@ class InstructCoderDataset(HuggingFaceDataset):
         for item in self.data:
             if len(sampled_requests) >= num_requests:
                 break
-            prompt = f"{item['instruction']}:\n{item['input']}"
+            prompt = f"{item['input']}\n\n{item['instruction']} Just output \
+            the code, do not include any explanation."
+
+            # apply template
+            prompt = tokenizer.apply_chat_template(
+                [{
+                    "role": "user",
+                    "content": prompt
+                }],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+
+            prompt_len = len(tokenizer(prompt).input_ids)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+        return sampled_requests
+
+
+# -----------------------------------------------------------------------------
+# MT-Bench Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class MTBenchDataset(HuggingFaceDataset):
+    """
+    MT-Bench Dataset.
+    https://huggingface.co/datasets/philschmid/mt-bench
+
+    We create a single turn dataset for MT-Bench.
+    This is similar to Spec decoding benchmark setup in vLLM
+    https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
+    """  # noqa: E501
+
+    DEFAULT_OUTPUT_LEN = 256  # avg len used in SD bench in vLLM
+    SUPPORTED_DATASET_PATHS = {
+        "philschmid/mt-bench",
+    }
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
+        output_len = (output_len
+                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        sampled_requests = []
+
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            prompt = item["turns"][0]
+
+            # apply template
+            prompt = tokenizer.apply_chat_template(
+                [{
+                    "role": "user",
+                    "content": prompt
+                }],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+
             prompt_len = len(tokenizer(prompt).input_ids)
             sampled_requests.append(
                 SampleRequest(
@@ -858,18 +1038,18 @@ def _format_zeta_prompt(
         sample: dict,
         original_start_marker: str = "<|editable_region_start|>") -> dict:
     """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
-    
-    This function formats examples from the NEP dataset 
-    into prompts and expected outputs. It could be 
+
+    This function formats examples from the NEP dataset
+    into prompts and expected outputs. It could be
     further extended to support more NEP datasets.
-    
+
     Args:
-        sample: The dataset sample containing events, 
+        sample: The dataset sample containing events,
             inputs, and outputs.
-        original_start_marker: The marker indicating the 
-            start of the editable region. Defaults to 
+        original_start_marker: The marker indicating the
+            start of the editable region. Defaults to
             "<|editable_region_start|>".
-            
+
     Returns:
         A dictionary with the formatted prompts and expected outputs.
     """
@@ -919,3 +1099,87 @@ class NextEditPredictionDataset(HuggingFaceDataset):
                 break
         self.maybe_oversample_requests(samples, num_requests)
         return samples
+
+
+# -----------------------------------------------------------------------------
+# ASR Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class ASRDataset(HuggingFaceDataset):
+    """
+    Dataset class for processing a ASR dataset for transcription.
+    Tested on the following set:
+
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
+    |                |                                        |                          | release3-speaker-adaptation |
+    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr,  ...        |
+    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
+    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
+    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
+    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+
+    """  # noqa: E501
+
+    SUPPORTED_DATASET_PATHS = {
+        "openslr/librispeech_asr",
+        "facebook/voxpopuli",
+        "LIUM/tedlium",
+        "edinburghcstr/ami",
+        "speechcolab/gigaspeech",
+        "kensho/spgispeech",
+    }
+
+    DEFAULT_OUTPUT_LEN = 128
+    IS_MULTIMODAL = True
+
+    # TODO Whisper-specific. Abstract interface when more models are supported.
+    TRANSCRIPTION_PREAMBLE = (
+        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>")
+    skip_long_audios: bool = True
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        **kwargs,
+    ) -> list:
+        output_len = (output_len
+                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
+        prompt_len = len(tokenizer(prompt).input_ids)
+        sampled_requests = []
+        skipped = 0
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            audio = item["audio"]
+            y, sr = audio["array"], audio["sampling_rate"]
+            duration_s = librosa.get_duration(y=y, sr=sr)
+            # Whisper max supported duration
+            if self.skip_long_audios and duration_s > 30:
+                skipped += 1
+                continue
+
+            mm_content = {"audio": (y, sr)}
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    multi_modal_data=mm_content,
+                ))
+        if skipped:
+            logger.warning(
+                "%d samples discarded from dataset due to"
+                " their length being greater than"
+                " what Whisper supports.",
+                skipped,
+            )
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+        return sampled_requests
diff --git a/vllm/benchmarks/endpoint_request_func.py b/vllm/benchmarks/endpoint_request_func.py
index 32767a896070c56aaef6e41bc46df4b23f99b7c7..aba60edc58cbfa77aa21e1c8a089e3cb8ca10ccd 100644
--- a/vllm/benchmarks/endpoint_request_func.py
+++ b/vllm/benchmarks/endpoint_request_func.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """The request function for API endpoints."""
 
+import io
 import json
 import os
 import sys
@@ -24,11 +26,11 @@ class RequestFuncInput:
     output_len: int
     model: str
     model_name: Optional[str] = None
-    best_of: int = 1
     logprobs: Optional[int] = None
     extra_body: Optional[dict] = None
     multi_modal_content: Optional[dict] = None
     ignore_eos: bool = False
+    language: Optional[str] = None
 
 
 @dataclass
@@ -71,7 +73,7 @@ async def async_request_openai_completions(
                 if request_func_input.model_name else request_func_input.model,
             "prompt": request_func_input.prompt,
             "temperature": 0.0,
-            "best_of": request_func_input.best_of,
+            "repetition_penalty": 1.0,
             "max_tokens": request_func_input.output_len,
             "logprobs": request_func_input.logprobs,
             "stream": True,
@@ -154,7 +156,226 @@ async def async_request_openai_completions(
     return output
 
 
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(("chat/completions", "profile")), (
+        "OpenAI Chat Completions API URL must end with 'chat/completions'.")
+
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        if request_func_input.multi_modal_content:
+            content.append(request_func_input.multi_modal_content)
+        payload = {
+            "model":
+            request_func_input.model_name
+            if request_func_input.model_name else request_func_input.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": content
+                },
+            ],
+            "temperature":
+            0.0,
+            "max_completion_tokens":
+            request_func_input.output_len,
+            "stream":
+            True,
+            "stream_options": {
+                "include_usage": True,
+            },
+        }
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                            "data: ")
+                        if chunk != "[DONE]":
+                            timestamp = time.perf_counter()
+                            data = json.loads(chunk)
+
+                            if choices := data.get("choices"):
+                                content = choices[0]["delta"].get("content")
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = timestamp - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp -
+                                                      most_recent_timestamp)
+
+                                generated_text += content or ""
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get(
+                                    "completion_tokens")
+
+                            most_recent_timestamp = timestamp
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = most_recent_timestamp - st
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_openai_audio(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    # Lazy import without PlaceholderModule to avoid vllm dep.
+    import soundfile
+
+    api_url = request_func_input.api_url
+    assert api_url.endswith(("transcriptions", "translations")), (
+        "OpenAI Chat Completions API URL must end with 'transcriptions' ")
+    "or `translations`."
+
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        payload = {
+            "model":
+            request_func_input.model_name
+            if request_func_input.model_name else request_func_input.model,
+            "temperature":
+            0.0,
+            "max_completion_tokens":
+            request_func_input.output_len,
+            "stream":
+            True,
+            "language":
+            "en",
+            # Flattened due to multipart/form-data
+            "stream_include_usage":
+            True,
+            "stream_continuous_usage_stats":
+            True,
+        }
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        # Send audio file
+        def to_bytes(y, sr):
+            buffer = io.BytesIO()
+            soundfile.write(buffer, y, sr, format="WAV")
+            buffer.seek(0)
+            return buffer
+
+        with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
+            form = aiohttp.FormData()
+            form.add_field("file", f, content_type="audio/wav")
+            for key, value in payload.items():
+                form.add_field(key, str(value))
+
+            output = RequestFuncOutput()
+            output.prompt_len = request_func_input.prompt_len
+
+            generated_text = ""
+            ttft = 0.0
+            st = time.perf_counter()
+            most_recent_timestamp = st
+            try:
+                async with session.post(url=api_url,
+                                        data=form,
+                                        headers=headers) as response:
+                    if response.status == 200:
+                        async for chunk_bytes in response.content:
+                            chunk_bytes = chunk_bytes.strip()
+                            if not chunk_bytes:
+                                continue
+
+                            chunk = chunk_bytes.decode("utf-8").removeprefix(
+                                "data: ")
+                            if chunk != "[DONE]":
+                                timestamp = time.perf_counter()
+                                data = json.loads(chunk)
+
+                                if choices := data.get("choices"):
+                                    content = choices[0]["delta"].get(
+                                        "content")
+                                    # First token
+                                    if ttft == 0.0:
+                                        ttft = timestamp - st
+                                        output.ttft = ttft
+
+                                    # Decoding phase
+                                    else:
+                                        output.itl.append(
+                                            timestamp - most_recent_timestamp)
+
+                                    generated_text += content or ""
+                                elif usage := data.get("usage"):
+                                    output.output_tokens = usage.get(
+                                        "completion_tokens")
+
+                                most_recent_timestamp = timestamp
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.latency = most_recent_timestamp - st
+                    else:
+                        output.error = response.reason or ""
+                        output.success = False
+            except Exception:
+                output.success = False
+                exc_info = sys.exc_info()
+                output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
 # TODO: Add more request functions for different API protocols.
 ASYNC_REQUEST_FUNCS = {
-    "openai-comp": async_request_openai_completions,
+    "vllm": async_request_openai_completions,
+    "openai": async_request_openai_completions,
+    "openai-chat": async_request_openai_chat_completions,
+    "openai-audio": async_request_openai_audio,
 }
+
+OPENAI_COMPATIBLE_BACKENDS = [
+    k for k, v in ASYNC_REQUEST_FUNCS.items()
+    if v in (async_request_openai_completions,
+             async_request_openai_chat_completions)
+]
diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index 2c992727b1392055abfc51a6178918b8d5211db1..5c6124db80b4fb6b438c595b44788033ea63ab5d 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark the latency of processing a single batch of requests."""
 
 import argparse
@@ -6,13 +7,12 @@ import dataclasses
 import json
 import os
 import time
-from pathlib import Path
 from typing import Any, Optional
 
 import numpy as np
-import torch
 from tqdm import tqdm
 
+import vllm.envs as envs
 from vllm import LLM, SamplingParams
 from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
                                    write_to_json)
@@ -59,13 +59,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
         action="store_true",
         help="profile the generation process of a single batch",
     )
-    parser.add_argument(
-        "--profile-result-dir",
-        type=str,
-        default=None,
-        help=("path to save the pytorch profiler output. Can be visualized "
-              "with ui.perfetto.dev or Tensorboard."),
-    )
     parser.add_argument(
         "--output-json",
         type=str,
@@ -82,12 +75,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser = EngineArgs.add_cli_args(parser)
     # V1 enables prefix caching by default which skews the latency
     # numbers. We need to disable prefix caching by default.
-    parser.set_defaults(enable_prefix_caching=True)
+    parser.set_defaults(enable_prefix_caching=False)
 
 
 def main(args: argparse.Namespace):
-    print(args)
-
+    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
+        raise OSError(
+            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
+            "Please set it to a valid path to use torch profiler.")
     engine_args = EngineArgs.from_cli_args(args)
 
     # NOTE(woosuk): If the request cannot be processed in a single batch,
@@ -106,7 +101,6 @@ def main(args: argparse.Namespace):
         max_tokens=args.output_len,
         detokenize=not args.disable_detokenize,
     )
-    print(sampling_params)
     dummy_prompt_token_ids = np.random.randint(10000,
                                                size=(args.batch_size,
                                                      args.input_len))
@@ -131,16 +125,9 @@ def main(args: argparse.Namespace):
 
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
-            with torch.profiler.profile(
-                    activities=[
-                        torch.profiler.ProfilerActivity.CPU,
-                        torch.profiler.ProfilerActivity.CUDA,
-                    ],
-                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                        str(profile_dir)),
-            ) as p:
-                llm_generate()
-            print(p.key_averages().table(sort_by="self_cuda_time_total"))
+            llm.start_profile()
+            llm_generate()
+            llm.stop_profile()
         else:
             start_time = time.perf_counter()
             llm_generate()
@@ -153,10 +140,7 @@ def main(args: argparse.Namespace):
         run_to_completion(profile_dir=None)
 
     if args.profile:
-        profile_dir = args.profile_result_dir
-        if not profile_dir:
-            profile_dir = (Path(".") / "vllm_benchmark_result" /
-                           f"latency_result_{time.time()}")
+        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index dc0ec32194866f0aecc145bfb31b0e41f58f3612..019ebcf8d50416a1b1d78c4c511ab519b75d151a 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 r"""Benchmark online serving throughput.
 
 On the server side, run one of the following commands
@@ -7,7 +8,7 @@ to launch the vLLM OpenAI API server:
 
 On the client side, run:
     vllm bench serve \
-        --endpoint-type <endpoint_type. Default 'openi-comp'> \
+        --endpoint-type <endpoint_type. Default 'openai'> \
         --label <benchmark result label. Default using endpoint_type> \
         --model <your_model> \
         --dataset-name <dataset_name. Default 'random'> \
@@ -22,7 +23,7 @@ import os
 import random
 import time
 import warnings
-from collections.abc import AsyncGenerator
+from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Optional
@@ -31,7 +32,14 @@ import numpy as np
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 
+from vllm.benchmarks.datasets import (AIMODataset, ASRDataset, BurstGPTDataset,
+                                      ConversationDataset, HuggingFaceDataset,
+                                      InstructCoderDataset, MTBenchDataset,
+                                      NextEditPredictionDataset, RandomDataset,
+                                      SampleRequest, ShareGPTDataset,
+                                      SonnetDataset, VisionArenaDataset)
 from vllm.benchmarks.endpoint_request_func import (ASYNC_REQUEST_FUNCS,
+                                                   OPENAI_COMPATIBLE_BACKENDS,
                                                    RequestFuncInput,
                                                    RequestFuncOutput)
 from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
@@ -71,53 +79,18 @@ class BenchmarkMetrics:
     percentiles_e2el_ms: list[tuple[float, float]]
 
 
-def sample_random_requests(
-    prefix_len: int,
-    input_len: int,
-    output_len: int,
-    num_prompts: int,
-    range_ratio: float,
-    tokenizer: PreTrainedTokenizerBase,
-) -> list[tuple[str, int, int]]:
-    prefix_token_ids = np.random.randint(0,
-                                         tokenizer.vocab_size,
-                                         size=prefix_len).tolist()
-
-    input_lens = np.random.randint(
-        int(input_len * range_ratio),
-        input_len + 1,
-        size=num_prompts,
-    )
-    output_lens = np.random.randint(
-        int(output_len * range_ratio),
-        output_len + 1,
-        size=num_prompts,
-    )
-    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
-    input_requests = []
-    for i in range(num_prompts):
-        prompt = tokenizer.decode(prefix_token_ids +
-                                  [(offsets[i] + i + j) % tokenizer.vocab_size
-                                   for j in range(input_lens[i])])
-
-        input_requests.append((prompt, int(prefix_len + input_lens[i]),
-                               int(output_lens[i]), None))
-
-    return input_requests
-
-
 async def get_request(
-    input_requests: list[tuple[str, int, int]],
+    input_requests: list[SampleRequest],
     request_rate: float,
     burstiness: float = 1.0,
-) -> AsyncGenerator[tuple[str, int, int], None]:
+) -> AsyncGenerator[SampleRequest, None]:
     """
     Asynchronously generates requests at a specified rate
     with OPTIONAL burstiness.
 
     Args:
         input_requests:
-            A list of input requests, each represented as a tuple.
+            A list of input requests, each represented as a SampleRequest.
         request_rate:
             The rate at which requests are generated (requests/s).
         burstiness (optional):
@@ -129,7 +102,7 @@ async def get_request(
             in more bursty requests, while a higher burstiness value
             (burstiness > 1) results in a more uniform arrival of requests.
     """
-    input_requests = iter(input_requests)
+    input_requests: Iterable[SampleRequest] = iter(input_requests)
 
     # Calculate scale parameter theta to maintain the desired request_rate.
     assert burstiness > 0, (
@@ -151,7 +124,7 @@ async def get_request(
 
 
 def calculate_metrics(
-    input_requests: list[tuple[str, int, int]],
+    input_requests: list[SampleRequest],
     outputs: list[RequestFuncOutput],
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
@@ -184,7 +157,7 @@ def calculate_metrics(
         if outputs[i].success:
             output_len = outputs[i].output_tokens
 
-            if output_len is None:
+            if not output_len:
                 # We use the tokenizer to count the number of output tokens
                 # for some serving backends instead of looking at
                 # len(outputs[i].itl) since multiple output tokens may be
@@ -194,7 +167,7 @@ def calculate_metrics(
                     tokenizer(outputs[i].generated_text,
                               add_special_tokens=False).input_ids)
             actual_output_lens.append(output_len)
-            total_input += input_requests[i][1]
+            total_input += input_requests[i].prompt_len
             tpot = 0
             if output_len > 1:
                 latency_minus_ttft = outputs[i].latency - outputs[i].ttft
@@ -277,19 +250,19 @@ async def benchmark(
     model_id: str,
     model_name: str,
     tokenizer: PreTrainedTokenizerBase,
-    input_requests: list[tuple[str, int, int]],
+    input_requests: list[SampleRequest],
     logprobs: Optional[int],
-    best_of: int,
     request_rate: float,
     burstiness: float,
     disable_tqdm: bool,
     profile: bool,
     selected_percentile_metrics: list[str],
-    selected_percentiles: list[str],
+    selected_percentiles: list[float],
     ignore_eos: bool,
     goodput_config_dict: dict[str, float],
     max_concurrency: Optional[int],
-    lora_modules: Optional[list[str]],
+    lora_modules: Optional[Iterable[str]],
+    extra_body: Optional[dict],
 ):
     if endpoint_type in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
@@ -298,11 +271,13 @@ async def benchmark(
 
     print("Starting initial single prompt test run...")
     test_prompt, test_prompt_len, test_output_len, test_mm_content = (
-        input_requests[0])
-    if endpoint_type != "openai-chat" and test_mm_content is not None:
-        # multi-modal benchmark is only available on OpenAI Chat endpoint.
-        raise ValueError("Multi-modal content is only supported on "
-                         "'openai-chat' endpoint_type.")
+        input_requests[0].prompt,
+        input_requests[0].prompt_len,
+        input_requests[0].expected_output_len,
+        input_requests[0].multi_modal_data,
+    )
+
+    assert test_mm_content is None or isinstance(test_mm_content, dict)
     test_input = RequestFuncInput(
         model=model_id,
         model_name=model_name,
@@ -311,9 +286,9 @@ async def benchmark(
         prompt_len=test_prompt_len,
         output_len=test_output_len,
         logprobs=logprobs,
-        best_of=best_of,
         multi_modal_content=test_mm_content,
         ignore_eos=ignore_eos,
+        extra_body=extra_body,
     )
 
     test_output = await request_func(request_func_input=test_input)
@@ -338,9 +313,9 @@ async def benchmark(
                                          prompt_len=test_prompt_len,
                                          output_len=test_output_len,
                                          logprobs=logprobs,
-                                         best_of=best_of,
                                          multi_modal_content=test_mm_content,
-                                         ignore_eos=ignore_eos)
+                                         ignore_eos=ignore_eos,
+                                         extra_body=extra_body)
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
             print("Profiler started")
@@ -374,7 +349,12 @@ async def benchmark(
     benchmark_start_time = time.perf_counter()
     tasks: list[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate, burstiness):
-        prompt, prompt_len, output_len, mm_content = request
+        prompt, prompt_len, output_len, mm_content = (
+            request.prompt,
+            request.prompt_len,
+            request.expected_output_len,
+            request.multi_modal_data,
+        )
         req_model_id, req_model_name = model_id, model_name
         if lora_modules:
             req_lora_module = next(lora_modules)
@@ -387,9 +367,9 @@ async def benchmark(
                                               prompt_len=prompt_len,
                                               output_len=output_len,
                                               logprobs=logprobs,
-                                              best_of=best_of,
                                               multi_modal_content=mm_content,
-                                              ignore_eos=ignore_eos)
+                                              ignore_eos=ignore_eos,
+                                              extra_body=extra_body)
         tasks.append(
             asyncio.create_task(
                 limited_request_func(request_func_input=request_func_input,
@@ -405,7 +385,6 @@ async def benchmark(
             prompt_len=test_prompt_len,
             output_len=test_output_len,
             logprobs=logprobs,
-            best_of=best_of,
         )
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
@@ -567,7 +546,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--endpoint-type",
         type=str,
-        default="openai-comp",
+        default="openai",
         choices=list(ASYNC_REQUEST_FUNCS.keys()),
     )
     parser.add_argument(
@@ -596,9 +575,16 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "--dataset-name",
         type=str,
         default="random",
-        choices=["random"],
+        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
         help="Name of the dataset to benchmark on.",
     )
+    parser.add_argument(
+        "--dataset-path",
+        type=str,
+        default=None,
+        help="Path to the sharegpt/sonnet dataset. "
+        "Or the huggingface dataset ID if using HF dataset.",
+    )
     parser.add_argument(
         "--max-concurrency",
         type=int,
@@ -624,13 +610,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help=
         "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
-    parser.add_argument(
-        "--best-of",
-        type=int,
-        default=1,
-        help="Generates `best_of` sequences per prompt and "
-        "returns the best one.",
-    )
     parser.add_argument("--use-beam-search", action="store_true")
     parser.add_argument(
         "--num-prompts",
@@ -691,6 +670,17 @@ def add_cli_args(parser: argparse.ArgumentParser):
         action="store_true",
         help="Specify to save benchmark results to a json file",
     )
+    parser.add_argument(
+        "--save-detailed",
+        action="store_true",
+        help="When saving the results, whether to include per request "
+        "information such as response, error, ttfs, tpots, etc.",
+    )
+    parser.add_argument(
+        "--append-result",
+        action="store_true",
+        help="Append the benchmark result to the existing json file.",
+    )
     parser.add_argument(
         "--metadata",
         metavar="KEY=VALUE",
@@ -733,6 +723,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         default="99",
         help="Comma-separated list of percentiles for selected metrics. "
         "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
+        "Default value is \"99\"."
         "Use \"--percentile-metrics\" to select metrics.",
     )
     parser.add_argument(
@@ -745,7 +736,41 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "separated by spaces. Allowed request level metric names are "
         "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
         "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
-        "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
+        "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
+    )
+
+    # group for dataset specific arguments
+    sonnet_group = parser.add_argument_group("sonnet dataset options")
+    sonnet_group.add_argument(
+        "--sonnet-input-len",
+        type=int,
+        default=550,
+        help=
+        "Number of input tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-output-len",
+        type=int,
+        default=150,
+        help=
+        "Number of output tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-prefix-len",
+        type=int,
+        default=200,
+        help=
+        "Number of prefix tokens per request, used only for sonnet dataset.",
+    )
+
+    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
+    sharegpt_group.add_argument(
+        "--sharegpt-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length "
+        "from the ShareGPT dataset.",
+    )
 
     random_group = parser.add_argument_group("random dataset options")
     random_group.add_argument(
@@ -765,9 +790,11 @@ def add_cli_args(parser: argparse.ArgumentParser):
     random_group.add_argument(
         "--random-range-ratio",
         type=float,
-        default=1.0,
-        help="Range of sampled ratio of input/output length, "
-        "used only for random sampling.",
+        default=0.0,
+        help="Range ratio for sampling input/output length, "
+        "used only for random sampling. Must be in the range [0, 1) to define "
+        "a symmetric sampling range"
+        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
     )
     random_group.add_argument(
         "--random-prefix-len",
@@ -778,6 +805,54 @@ def add_cli_args(parser: argparse.ArgumentParser):
         " request is [random-prefix-len, "
         " random-prefix-len + random-prefix-len * random-range-ratio).")
 
+    hf_group = parser.add_argument_group("hf dataset options")
+    hf_group.add_argument("--hf-subset",
+                          type=str,
+                          default=None,
+                          help="Subset of the HF dataset.")
+    hf_group.add_argument("--hf-split",
+                          type=str,
+                          default=None,
+                          help="Split of the HF dataset.")
+    hf_group.add_argument(
+        "--hf-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output lengths "
+        "from the sampled HF dataset.",
+    )
+
+    sampling_group = parser.add_argument_group("sampling parameters")
+    sampling_group.add_argument(
+        "--top-p",
+        type=float,
+        default=None,
+        help="Top-p sampling parameter. Only has effect on "
+        "openai-compatible backends.",
+    )
+    sampling_group.add_argument(
+        "--top-k",
+        type=int,
+        default=None,
+        help="Top-k sampling parameter. Only has effect on "
+        "openai-compatible backends.",
+    )
+    sampling_group.add_argument(
+        "--min-p",
+        type=float,
+        default=None,
+        help="Min-p sampling parameter. Only has effect on "
+        "openai-compatible backends.",
+    )
+    sampling_group.add_argument(
+        "--temperature",
+        type=float,
+        default=None,
+        help="Temperature sampling parameter. Only has effect on "
+        "openai-compatible backends. If not specified, default to greedy "
+        "decoding (i.e. temperature==0.0).",
+    )
+
     parser.add_argument(
         '--tokenizer-mode',
         type=str,
@@ -826,27 +901,142 @@ def main(args: argparse.Namespace):
     tokenizer = get_tokenizer(tokenizer_id,
                               tokenizer_mode=tokenizer_mode,
                               trust_remote_code=args.trust_remote_code)
-    # TODO: This should be refactored to use the benchmark_dataset.py
-    # in later PRs.
+
     if args.dataset_name is None:
         raise ValueError(
             "Please specify '--dataset-name' and the corresponding "
             "'--dataset-path' if required.")
-    elif args.dataset_name == "random":
-        input_requests = sample_random_requests(
-            prefix_len=args.random_prefix_len,
-            input_len=args.random_input_len,
-            output_len=args.random_output_len,
-            num_prompts=args.num_prompts,
-            range_ratio=args.random_range_ratio,
+
+    if args.dataset_name == "sonnet":
+        dataset = SonnetDataset(dataset_path=args.dataset_path)
+        # For the "sonnet" dataset, formatting depends on the backend.
+        if args.backend == "openai-chat":
+            input_requests = dataset.sample(
+                num_requests=args.num_prompts,
+                input_len=args.sonnet_input_len,
+                output_len=args.sonnet_output_len,
+                prefix_len=args.sonnet_prefix_len,
+                tokenizer=tokenizer,
+                return_prompt_formatted=False,
+            )
+        else:
+            assert tokenizer.chat_template or tokenizer.default_chat_template, (
+                "Tokenizer/model must have chat template for sonnet dataset.")
+            input_requests = dataset.sample(
+                num_requests=args.num_prompts,
+                input_len=args.sonnet_input_len,
+                output_len=args.sonnet_output_len,
+                prefix_len=args.sonnet_prefix_len,
+                tokenizer=tokenizer,
+                return_prompt_formatted=True,
+            )
+
+    elif args.dataset_name == "hf":
+        # all following datasets are implemented from the
+        # HuggingFaceDataset base class
+        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = VisionArenaDataset
+            args.hf_split = "train"
+            args.hf_subset = None
+        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = InstructCoderDataset
+            args.hf_split = "train"
+        elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = MTBenchDataset
+            args.hf_split = "train"
+        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = ConversationDataset
+            args.hf_split = "train"
+        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = AIMODataset
+            args.hf_split = "train"
+        elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS:  # noqa: E501
+            dataset_class = NextEditPredictionDataset
+            args.hf_split = "train"
+        elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = ASRDataset
+            args.hf_split = "train"
+        else:
+            supported_datasets = set([
+                dataset_name for cls in HuggingFaceDataset.__subclasses__()
+                for dataset_name in cls.SUPPORTED_DATASET_PATHS
+            ])
+            raise ValueError(
+                f"Unsupported dataset path: {args.dataset_path}. "
+                "Huggingface dataset only supports dataset_path"
+                f" from one of following: {supported_datasets}. "
+                "Please consider contributing if you would "
+                "like to add support for additional dataset formats.")
+
+        if dataset_class.IS_MULTIMODAL and endpoint_type not in [
+                "openai-chat",
+                "openai-audio",
+        ]:
+            # multi-modal benchmark is only available on OpenAI Chat backend.
+            raise ValueError(
+                "Multi-modal content is only supported on 'openai-chat' and "
+                "'openai-audio' backend.")
+        input_requests = dataset_class(
+            dataset_path=args.dataset_path,
+            dataset_subset=args.hf_subset,
+            dataset_split=args.hf_split,
+            random_seed=args.seed,
+        ).sample(
+            num_requests=args.num_prompts,
             tokenizer=tokenizer,
+            output_len=args.hf_output_len,
         )
 
     else:
-        raise ValueError(f"Unknown dataset: {args.dataset_name}")
-
+        # For datasets that follow a similar structure, use a mapping.
+        dataset_mapping = {
+            "sharegpt":
+            lambda: ShareGPTDataset(random_seed=args.seed,
+                                    dataset_path=args.dataset_path).sample(
+                                        tokenizer=tokenizer,
+                                        num_requests=args.num_prompts,
+                                        output_len=args.sharegpt_output_len,
+                                    ),
+            "burstgpt":
+            lambda: BurstGPTDataset(random_seed=args.seed,
+                                    dataset_path=args.dataset_path).
+            sample(tokenizer=tokenizer, num_requests=args.num_prompts),
+            "random":
+            lambda: RandomDataset(dataset_path=args.dataset_path).sample(
+                tokenizer=tokenizer,
+                num_requests=args.num_prompts,
+                prefix_len=args.random_prefix_len,
+                input_len=args.random_input_len,
+                output_len=args.random_output_len,
+                range_ratio=args.random_range_ratio,
+            ),
+        }
+
+        try:
+            input_requests = dataset_mapping[args.dataset_name]()
+        except KeyError as err:
+            raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
     goodput_config_dict = check_goodput_args(args)
 
+    # Collect the sampling parameters.
+    sampling_params = {
+        k: v
+        for k, v in {
+            "top_p": args.top_p,
+            "top_k": args.top_k,
+            "min_p": args.min_p,
+            "temperature": args.temperature,
+        }.items() if v is not None
+    }
+
+    # Sampling parameters are only supported by openai-compatible backend.
+    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
+        raise ValueError("Sampling parameters are only supported by "
+                         "openai-compatible backends.")
+
+    if "temperature" not in sampling_params:
+        sampling_params["temperature"] = 0.0  # Default to greedy decoding.
+
     # Avoid GC processing "static" data - reduce pause times.
     gc.collect()
     gc.freeze()
@@ -861,7 +1051,6 @@ def main(args: argparse.Namespace):
             tokenizer=tokenizer,
             input_requests=input_requests,
             logprobs=args.logprobs,
-            best_of=args.best_of,
             request_rate=args.request_rate,
             burstiness=args.burstiness,
             disable_tqdm=args.disable_tqdm,
@@ -874,10 +1063,11 @@ def main(args: argparse.Namespace):
             goodput_config_dict=goodput_config_dict,
             max_concurrency=args.max_concurrency,
             lora_modules=args.lora_modules,
+            extra_body=sampling_params,
         ))
 
     # Save config and results to json
-    if args.save_result:
+    if args.save_result or args.append_result:
         result_json: dict[str, Any] = {}
 
         # Setup
@@ -887,7 +1077,6 @@ def main(args: argparse.Namespace):
         result_json["label"] = label
         result_json["model_id"] = model_id
         result_json["tokenizer_id"] = tokenizer_id
-        result_json["best_of"] = args.best_of
         result_json["num_prompts"] = args.num_prompts
 
         # Metadata
@@ -910,6 +1099,21 @@ def main(args: argparse.Namespace):
         # Merge with benchmark result
         result_json = {**result_json, **benchmark_result}
 
+        if not args.save_detailed:
+            # Remove fields with too many data points
+            for field in [
+                    "input_lens",
+                    "output_lens",
+                    "ttfts",
+                    "itls",
+                    "generated_texts",
+                    "errors",
+            ]:
+                if field in result_json:
+                    del result_json[field]
+                if field in benchmark_result:
+                    del benchmark_result[field]
+
         # Save to file
         base_model_id = model_id.split("/")[-1]
         max_concurrency_str = (f"-concurrency{args.max_concurrency}"
@@ -919,7 +1123,13 @@ def main(args: argparse.Namespace):
         if args.result_filename:
             file_name = args.result_filename
         if args.result_dir:
+            os.makedirs(args.result_dir, exist_ok=True)
             file_name = os.path.join(args.result_dir, file_name)
-        with open(file_name, "w", encoding='utf-8') as outfile:
+        with open(file_name,
+                  mode="a+" if args.append_result else "w",
+                  encoding="utf-8") as outfile:
+            # Append a newline.
+            if args.append_result and outfile.tell() != 0:
+                outfile.write("\n")
             json.dump(result_json, outfile)
         save_to_pytorch_benchmark_format(args, result_json, file_name)
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index 13110a8b4db3f67fd155b32c970848d85db3eeaa..be9ea39f0c38e835d042d6938ab43100be0777c5 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline inference throughput."""
 import argparse
 import dataclasses
@@ -527,7 +528,6 @@ def main(args: argparse.Namespace):
     validate_args(args)
     if args.seed is None:
         args.seed = 0
-    print(args)
     random.seed(args.seed)
     # Sample the requests.
     tokenizer = AutoTokenizer.from_pretrained(
diff --git a/vllm/benchmarks/utils.py b/vllm/benchmarks/utils.py
index 45a0ddbd5d08d65a2ecf7200bb179eff0e792ce1..f0bb99326ab4059c49674615bb18e49976660a78 100644
--- a/vllm/benchmarks/utils.py
+++ b/vllm/benchmarks/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import json
diff --git a/vllm/collect_env.py b/vllm/collect_env.py
index 86eb465b8f658c74f9d021e4c01eed8bdf148305..64172a9bf91d244a8e609d23ecc8d72b88cc4168 100644
--- a/vllm/collect_env.py
+++ b/vllm/collect_env.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # ruff: noqa
 # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
@@ -6,7 +7,6 @@
 import datetime
 import locale
 import os
-import re
 import subprocess
 import sys
 # Unlike the rest of the PyTorch this file must be python2 compliant.
@@ -14,6 +14,8 @@ import sys
 # Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
 from collections import namedtuple
 
+import regex as re
+
 from vllm.envs import environment_variables
 
 try:
@@ -815,4 +817,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/vllm/compilation/activation_quant_fusion.py b/vllm/compilation/activation_quant_fusion.py
index dc3e1482e2b48a3bc0e913707be8e7242af22d29..ce4e50a2b02d11ecb3658925eb1c45e09e5feac1 100644
--- a/vllm/compilation/activation_quant_fusion.py
+++ b/vllm/compilation/activation_quant_fusion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 8114cddcd9faad34324e3a25be60c04298b27450..5af3b7efed2d65e035aab33e09b5d087bfac912c 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import ast
 import dataclasses
@@ -10,12 +11,13 @@ from typing import Any, Callable, Optional
 
 import torch
 import torch.fx as fx
+from torch._dispatch.python import enable_python_dispatcher
 
 import vllm.envs as envs
 from vllm.config import CompilationConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import resolve_obj_by_qualname
+from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname
 
 from .compiler_interface import (CompilerInterface, EagerAdaptor,
                                  InductorAdaptor, InductorStandaloneAdaptor)
@@ -28,14 +30,15 @@ logger = init_logger(__name__)
 
 def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
     if compilation_config.use_inductor:
-        if envs.VLLM_TEST_STANDALONE_COMPILE:
-            logger.info("Using InductorStandaloneAdaptor")
+        if envs.VLLM_USE_STANDALONE_COMPILE and is_torch_equal_or_newer(
+                "2.8.0"):
+            logger.debug("Using InductorStandaloneAdaptor")
             return InductorStandaloneAdaptor()
         else:
-            logger.info("Using InductorAdaptor")
+            logger.debug("Using InductorAdaptor")
             return InductorAdaptor()
     else:
-        logger.info("Using EagerAdaptor")
+        logger.debug("Using EagerAdaptor")
         return EagerAdaptor()
 
 
@@ -269,7 +272,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
             self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
             for t in args
         ]
-        with self.fake_mode:
+        with self.fake_mode, enable_python_dispatcher():
             return super().run(*fake_args)
 
     def call_module(self, target: torch.fx.node.Target,
diff --git a/vllm/compilation/base_piecewise_backend.py b/vllm/compilation/base_piecewise_backend.py
index 84d1e1f77739e8f3fdaf01767acaefff718c4bee..4d7aeeb4d03e3f746b3505df3c98812906650b36 100644
--- a/vllm/compilation/base_piecewise_backend.py
+++ b/vllm/compilation/base_piecewise_backend.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Protocol
 
diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
index f651ee6912abb4fc0c74ae26aa1aa630ba21fdeb..f754fc2388b20639ef4a2f28dc98298177d0e78c 100644
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index 21af5eb76ee8a42fc7af1ad4459b94096ce22a18..36c810ec2dc96ee1804255792aa92d4b90287a44 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
 import copy
 import hashlib
@@ -12,6 +13,7 @@ import torch._inductor.compile_fx
 import torch.fx as fx
 
 import vllm.envs as envs
+from vllm.compilation.counter import compilation_counter
 from vllm.config import VllmConfig
 from vllm.utils import is_torch_equal_or_newer
 
@@ -154,7 +156,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
     This is not on by default yet, but we plan to turn it on by default for
     PyTorch 2.8.
 
-    Use VLLM_TEST_STANDALONE_COMPILE to toggle this on or off.
+    Use VLLM_USE_STANDALONE_COMPILE to toggle this on or off.
     """
     name = "inductor_standalone"
 
@@ -175,6 +177,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
         runtime_shape: Optional[int] = None,
         key: Optional[str] = None,
     ) -> tuple[Optional[Callable], Optional[Any]]:
+        compilation_counter.num_inductor_compiles += 1
         current_config = {}
         if compiler_config is not None:
             current_config.update(compiler_config)
@@ -262,6 +265,7 @@ class InductorAdaptor(CompilerInterface):
         runtime_shape: Optional[int] = None,
         key: Optional[str] = None,
     ) -> tuple[Optional[Callable], Optional[Any]]:
+        compilation_counter.num_inductor_compiles += 1
         from torch._inductor.compile_fx import compile_fx
         current_config = {}
         if compiler_config is not None:
@@ -412,8 +416,14 @@ class InductorAdaptor(CompilerInterface):
         # compilation cache. So turn off the checks if we disable the
         # compilation cache.
         if not envs.VLLM_DISABLE_COMPILE_CACHE:
-            assert hash_str is not None, (
-                "failed to get the hash of the compiled graph")
+            if hash_str is None:
+                raise RuntimeError(
+                    "vLLM failed to compile the model. The most "
+                    "likely reason for this is that a previous compilation "
+                    "failed, leading to a corrupted compilation artifact. "
+                    "We recommend trying to "
+                    "remove ~/.cache/vllm/torch_compile_cache and try again "
+                    "to see the real issue. ")
             assert file_path is not None, (
                 "failed to get the file path of the compiled graph")
         return compiled_graph, (hash_str, file_path)
@@ -528,6 +538,7 @@ class EagerAdaptor(CompilerInterface):
         runtime_shape: Optional[int] = None,
         key: Optional[str] = None,
     ) -> tuple[Optional[Callable], Optional[Any]]:
+        compilation_counter.num_eager_compiles += 1
         # we don't need to compile the graph, just return the graph itself.
         # It does not support caching, return None for the handle.
         return graph, None
diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py
index 5be452593c62019919bbd20a09b8b17e78e0f5ef..165347cfccef731b9d314216a7b010ddb7f48951 100644
--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 import dataclasses
@@ -14,7 +15,11 @@ class CompilationCounter:
     # not including the splitting ops
     num_piecewise_capturable_graphs_seen: int = 0
     num_backend_compilations: int = 0
-    num_cudagraph_caputured: int = 0
+    num_cudagraph_captured: int = 0
+    # InductorAdapter.compile calls
+    num_inductor_compiles: int = 0
+    # EagerAdapter.compile calls
+    num_eager_compiles: int = 0
 
     def clone(self) -> "CompilationCounter":
         return copy.deepcopy(self)
diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py
index 0ad480e28cd707ae93d4b9f0a59f933ca4ccc95f..993def49af700257288765c7ce8882f9b2533fe5 100644
--- a/vllm/compilation/cuda_piecewise_backend.py
+++ b/vllm/compilation/cuda_piecewise_backend.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from contextlib import ExitStack
@@ -192,7 +193,7 @@ class CUDAPiecewiseBackend:
             entry.output = weak_ref_tensors(output)
             entry.cudagraph = cudagraph
 
-            compilation_counter.num_cudagraph_caputured += 1
+            compilation_counter.num_cudagraph_captured += 1
 
             # important: we need to return the output, rather than
             # the weak ref of the output, so that pytorch can correctly
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index f02994c55527d28dd08ec94018902032b195665f..05e4ca9f08b36a77c3de5e07b7fbf9a217fad2f4 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import inspect
 from typing import Callable, Optional, TypeVar, Union, overload
diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py
index 70f3b8b6df94b126c5edc12dfb6a47bf65d97448..286221d32c1ee1ed677d7c4d1ef6e8390f4cccc2 100644
--- a/vllm/compilation/fix_functionalization.py
+++ b/vllm/compilation/fix_functionalization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import operator
 from collections.abc import Iterable
diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py
index df3b5cb6bca63e8dc11d1458e753279b2d772cb1..cfc95c1c4f426c48c659164021098b78b0eb77a4 100644
--- a/vllm/compilation/fusion.py
+++ b/vllm/compilation/fusion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, NamedTuple, Optional
 
diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py
index b9eeb0c8d2af3a0675d8caf4f113cb3a6a60dc55..9ef3889323887682066e91251059f69373919354 100644
--- a/vllm/compilation/fx_utils.py
+++ b/vllm/compilation/fx_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import operator
 from collections.abc import Iterable
diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py
index a9359fe1e11708db63bde00c0e38b4ec3370f5ee..810d0801e9f3850afa57b0907f6d7c07d184cdff 100644
--- a/vllm/compilation/inductor_pass.py
+++ b/vllm/compilation/inductor_pass.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
 import inspect
diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py
index 786c7c1e1859a64a6338d869812172a01b757341..1e059b59fb64d9712e69347af0ce1d0a65f84929 100644
--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import time
diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py
index cef19f9257ed72216e3adda86112ebfeae5ac5c1..6d1893777cec6cb6d815a96d3b0a9976439df5cf 100644
--- a/vllm/compilation/multi_output_match.py
+++ b/vllm/compilation/multi_output_match.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import abc
 import operator
diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py
index 13e4cd73f8ce79038647eb99549da2b021b1dbd0..46f70dcdc6886f311d10f39877e6080b263d5cde 100644
--- a/vllm/compilation/noop_elimination.py
+++ b/vllm/compilation/noop_elimination.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Union
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index 07ebd3e1b7dde715be4c11f424b58351eec49f9d..621c89a14487452b59f0223681b33c8c65e385db 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from torch import fx as fx
 
diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py
index 17dded87fe8dc107144038707db12f26d77e6ad1..d41093903480b1082190486756d87553062e8d4f 100644
--- a/vllm/compilation/sequence_parallelism.py
+++ b/vllm/compilation/sequence_parallelism.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/compilation/torch25_custom_graph_pass.py b/vllm/compilation/torch25_custom_graph_pass.py
index 4b881d0b6f2da83a519a12ebc9b5a7ecc2a8dfed..cd3970657522ed6454c4672771f2acb2e435b7dd 100644
--- a/vllm/compilation/torch25_custom_graph_pass.py
+++ b/vllm/compilation/torch25_custom_graph_pass.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from typing import Any, Optional
 
diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py
index 0fe73b72b1deea727720cb6c538acf82c2b6f1a4..3ccbf52d9fd38767777d4656c4a11a2a8af82418 100644
--- a/vllm/compilation/vllm_inductor_pass.py
+++ b/vllm/compilation/vllm_inductor_pass.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 1a8211f0ab7c6b55202a24c1be75ceb34a900c68..2a261c84c3fc398fa7620698ce5740a8424c7db2 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import sys
@@ -40,11 +41,16 @@ class TorchCompileWrapperWithCustomDispatcher:
             # compiling the forward method
 
             backend = vllm_config.compilation_config.init_backend(vllm_config)
+            options = None
+            if isinstance(backend, str) and backend == "inductor":
+                options = get_current_vllm_config(
+                ).compilation_config.inductor_compile_config
 
             compiled_callable = torch.compile(
                 self.forward,
                 fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                backend=backend)
+                backend=backend,
+                options=options)
 
         self.compiled_callable = compiled_callable
         self.original_code_object = self.__class__.forward.__code__
diff --git a/vllm/config.py b/vllm/config.py
index 6e5f1a26e406ea571b8540c23eb4fb041b08a842..6b0e8c52d68ecfdd1bcb7edce71b4993a3dd67c4 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import ast
 import copy
@@ -11,8 +12,8 @@ import uuid
 import warnings
 from collections import Counter
 from contextlib import contextmanager
-from dataclasses import (MISSING, Field, asdict, dataclass, field, fields,
-                         is_dataclass, replace)
+from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass,
+                         replace)
 from functools import cached_property
 from importlib.util import find_spec
 from pathlib import Path
@@ -21,9 +22,13 @@ from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional,
 
 import regex as re
 import torch
+from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator,
+                      model_validator)
+from pydantic.dataclasses import dataclass
+from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 from torch.distributed import ProcessGroup, ReduceOp
 from transformers import PretrainedConfig
-from typing_extensions import deprecated
+from typing_extensions import deprecated, runtime_checkable
 
 import vllm.envs as envs
 from vllm import version
@@ -39,15 +44,17 @@ from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
     get_hf_text_config, get_pooling_config,
     get_sentence_transformer_tokenizer_config, is_encoder_decoder,
-    try_get_generation_config, uses_mrope)
+    try_get_generation_config, try_get_safetensors_metadata,
+    try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
 from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
                         MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
                         POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes,
-                        LayerBlockType, cuda_device_count_stateless,
-                        get_cpu_memory, get_open_port, is_torch_equal_or_newer,
-                        random_uuid, resolve_obj_by_qualname)
+                        LayerBlockType, common_broadcastable_dtype,
+                        cuda_device_count_stateless, get_cpu_memory,
+                        get_open_port, is_torch_equal_or_newer, random_uuid,
+                        resolve_obj_by_qualname)
 
 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
@@ -57,10 +64,15 @@ if TYPE_CHECKING:
     from vllm.model_executor.layers.quantization.base_config import (
         QuantizationConfig)
     from vllm.model_executor.model_loader import BaseModelLoader
+    from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 
     ConfigType = type[DataclassInstance]
 else:
+    PlacementGroup = Any
+    ExecutorBase = Any
     QuantizationConfig = Any
+    BaseModelLoader = Any
+    TensorizerConfig = Any
     ConfigType = type
 
 logger = init_logger(__name__)
@@ -92,6 +104,7 @@ HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
                                              PretrainedConfig]]
 
 
+@runtime_checkable
 class SupportsHash(Protocol):
 
     def compute_hash(self) -> str:
@@ -223,7 +236,7 @@ ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
 
 
 @config
-@dataclass
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class ModelConfig:
     """Configuration for the model."""
 
@@ -236,7 +249,7 @@ class ModelConfig:
     task, even if the same model can be used for multiple tasks. When the model
     only supports one task, "auto" can be used to select it; otherwise, you
     must specify explicitly which task to use."""
-    tokenizer: str = None  # type: ignore
+    tokenizer: SkipValidation[str] = None  # type: ignore
     """Name or path of the Hugging Face tokenizer to use. If unspecified, model
     name or path will be used."""
     tokenizer_mode: TokenizerMode = "auto"
@@ -284,7 +297,7 @@ class ModelConfig:
     """The specific revision to use for the tokenizer on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
-    max_model_len: int = None  # type: ignore
+    max_model_len: SkipValidation[int] = None  # type: ignore
     """Model context length (prompt and output). If unspecified, will be
     automatically derived from the model config.
 
@@ -295,7 +308,7 @@ class ModelConfig:
     - 25.6k -> 25,600"""
     spec_target_max_model_len: Optional[int] = None
     """Specify the maximum length for spec decoding draft models."""
-    quantization: Optional[QuantizationMethods] = None
+    quantization: SkipValidation[Optional[QuantizationMethods]] = None
     """Method used to quantize the weights. If `None`, we first check the
     `quantization_config` attribute in the model config file. If that is
     `None`, we assume the model weights are not quantized and use `dtype` to
@@ -371,7 +384,7 @@ class ModelConfig:
     """Initialize non-default neuron config or override default neuron config
     that are specific to Neuron devices, this argument will be used to
     configure the neuron config that can not be gathered from the vllm
-    arguments. e.g. `{"cast_logits_dtype": "bloat16"}`."""
+    arguments. e.g. `{"cast_logits_dtype": "bfloat16"}`."""
     pooler_config: Optional["PoolerConfig"] = field(init=False)
     """Pooler config which controls the behaviour of output pooling in pooling
     models."""
@@ -531,7 +544,24 @@ class ModelConfig:
         self.encoder_config = self._get_encoder_config()
         self.hf_image_processor_config = get_hf_image_processor_config(
             self.model, hf_token=self.hf_token, revision=self.revision)
-        self.dtype = _get_and_verify_dtype(self.hf_config, self.dtype)
+
+        supported_tasks, task = self._resolve_task(self.task)
+        self.supported_tasks = supported_tasks
+        self.task = task
+        if self.task in ("draft", "generate"):
+            self.truncation_side = "left"
+        else:
+            self.truncation_side = "right"
+
+        self.pooler_config = self._init_pooler_config()
+
+        self.dtype = _get_and_verify_dtype(
+            self.model,
+            self.hf_config,
+            self.dtype,
+            is_pooling_model=self.runner_type == "pooling",
+            revision=self.revision,
+        )
 
         # Workaround for Gemma 2 which uses interleaved sliding window
         # attention, but it's not specified in its config. TODO: remove this
@@ -571,13 +601,8 @@ class ModelConfig:
 
                 sliding_window = None
 
-        self.max_model_len = _get_and_verify_max_len(
-            hf_config=self.hf_text_config,
-            max_model_len=self.max_model_len,
-            disable_sliding_window=self.disable_sliding_window,
-            sliding_window_len=self.get_hf_config_sliding_window(),
-            spec_target_max_model_len=self.spec_target_max_model_len,
-            encoder_config=self.encoder_config)
+        self.original_max_model_len = self.max_model_len
+        self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
         self.served_model_name = get_served_model_name(self.model,
                                                        self.served_model_name)
         self.multimodal_config = self._init_multimodal_config()
@@ -593,20 +618,26 @@ class ModelConfig:
             raise ValueError(
                 "`override_neuron_config` is only supported on Neuron.")
 
-        supported_tasks, task = self._resolve_task(self.task)
-        self.supported_tasks = supported_tasks
-        self.task = task
-        if self.task in ("draft", "generate"):
-            self.truncation_side = "left"
-        else:
-            self.truncation_side = "right"
-
-        self.pooler_config = self._init_pooler_config()
-
         self._verify_quantization()
         self._verify_cuda_graph()
         self._verify_bnb_config()
 
+    @field_validator("quantization", mode="before")
+    @classmethod
+    def validate_quantization_before(cls, value: Any) -> Any:
+        if isinstance(value, str):
+            return value.lower()
+        return value
+
+    @model_validator(mode="after")
+    def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
+        if not isinstance(self.tokenizer, str):
+            raise ValueError("tokenizer must be a string after __post_init__.")
+        if not isinstance(self.max_model_len, int):
+            raise ValueError(
+                "max_model_len must be an integer after __post_init__.")
+        return self
+
     @property
     def registry(self):
         return ModelRegistry
@@ -618,7 +649,7 @@ class ModelConfig:
     def maybe_pull_model_tokenizer_for_s3(self, model: str,
                                           tokenizer: str) -> None:
         """Pull model/tokenizer from S3 to temporary directory when needed.
-        
+
         Args:
             model: Model name or path
             tokenizer: Tokenizer name or path
@@ -672,7 +703,6 @@ class ModelConfig:
             self.model, self.revision)
 
     def _init_pooler_config(self) -> Optional["PoolerConfig"]:
-
         if self.runner_type == "pooling":
             if isinstance(self.override_pooler_config, dict):
                 self.override_pooler_config = PoolerConfig(
@@ -796,17 +826,12 @@ class ModelConfig:
         else:
             # Aliases
             if task_option == "embedding":
-                preferred_task = self._get_preferred_task(
-                    architectures, supported_tasks)
-                if preferred_task != "embed":
-                    msg = ("The 'embedding' task will be restricted to "
-                           "embedding models in a future release. Please "
-                           "pass `--task classify`, `--task score`, or "
-                           "`--task reward` explicitly for other pooling "
-                           "models.")
-                    warnings.warn(msg, DeprecationWarning, stacklevel=2)
+                msg = ("The 'embedding' task has been renamed to "
+                       "'embed', please use the new name. The old name "
+                       "will be removed in v1.0.")
+                warnings.warn(msg, DeprecationWarning, stacklevel=2)
 
-                task_option = preferred_task or "embed"
+                task_option = "embed"
 
             if task_option not in supported_tasks:
                 msg = (
@@ -833,8 +858,7 @@ class ModelConfig:
             "quark", "modelopt_fp4", "bitblas", "gptq_bitblas"
         ]
         if self.quantization is not None:
-            self.quantization = cast(QuantizationMethods,
-                                     self.quantization.lower())
+            self.quantization = cast(QuantizationMethods, self.quantization)
 
         # Parse quantization method from the HF model config, if available.
         quant_cfg = self._parse_quant_hf_config()
@@ -1346,6 +1370,16 @@ class ModelConfig:
     @property
     def is_encoder_decoder(self) -> bool:
         """Extract the HF encoder/decoder model flag."""
+        """
+        For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to
+        True to enable cross-attention
+        Neuron needs all multimodal data to be in the decoder and does not
+        need to explicitly enable cross-attention
+        """
+        if (current_platform.is_neuron()
+                and self.hf_config.model_type == "mllama"):
+            return False
+
         return is_encoder_decoder(self.hf_config)
 
     @property
@@ -1386,6 +1420,28 @@ class ModelConfig:
     def matryoshka_dimensions(self):
         return getattr(self.hf_config, "matryoshka_dimensions", None)
 
+    def get_and_verify_max_len(self, max_model_len: int):
+        max_model_len = _get_and_verify_max_len(
+            hf_config=self.hf_text_config,
+            max_model_len=max_model_len,
+            disable_sliding_window=self.disable_sliding_window,
+            sliding_window_len=self.get_hf_config_sliding_window(),
+            spec_target_max_model_len=self.spec_target_max_model_len,
+            encoder_config=self.encoder_config)
+
+        tokenizer_config = try_get_tokenizer_config(
+            self.tokenizer,
+            trust_remote_code=self.trust_remote_code,
+            revision=self.tokenizer_revision)
+
+        if tokenizer_config is None:
+            return max_model_len
+
+        model_max_length = tokenizer_config.get("model_max_length",
+                                                max_model_len)
+        max_model_len = min(max_model_len, model_max_length)
+        return max_model_len
+
 
 BlockSize = Literal[1, 8, 16, 32, 64, 128]
 CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2"]
@@ -1397,7 +1453,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
 class CacheConfig:
     """Configuration for the KV cache."""
 
-    block_size: BlockSize = None  # type: ignore
+    block_size: SkipValidation[BlockSize] = None  # type: ignore
     """Size of a contiguous cache block in number of tokens. This is ignored on
     neuron devices and set to `--max-model-len`. On CUDA devices, only block
     sizes up to 32 are supported. On HPU devices, block size defaults to 128.
@@ -1619,7 +1675,8 @@ class LoadConfig:
     download_dir: Optional[str] = None
     """Directory to download and load the weights, default to the default
     cache directory of Hugging Face."""
-    model_loader_extra_config: dict = field(default_factory=dict)
+    model_loader_extra_config: Union[dict, TensorizerConfig] = field(
+        default_factory=dict)
     """Extra config for model loader. This will be passed to the model loader
     corresponding to the chosen load_format."""
     ignore_patterns: Optional[Union[list[str], str]] = None
@@ -1699,6 +1756,8 @@ class ParallelConfig:
     """Port for data parallel messaging."""
     data_parallel_master_port: int = 29500
     """Port of the data parallel master."""
+    data_parallel_backend: str = "mp"
+    """Backend to use for data parallel, either "mp" or "ray"."""
     enable_expert_parallel: bool = False
     """Use expert parallelism instead of tensor parallelism for MoE layers."""
     max_parallel_loading_workers: Optional[int] = None
@@ -1747,6 +1806,10 @@ class ParallelConfig:
     rank: int = 0
     """Global rank in distributed setup."""
 
+    enable_multimodal_encoder_data_parallel: bool = False
+    """ Use data parallelism instead of tensor parallelism for vision encoder.
+    Only support LLama4 for now"""
+
     @property
     def world_size_across_dp(self) -> int:
         """world_size_across_dp is TPxPPxDP, it is the size of the world
@@ -1806,6 +1869,8 @@ class ParallelConfig:
         factors.append(self.pipeline_parallel_size)
         factors.append(self.tensor_parallel_size)
         factors.append(self.enable_expert_parallel)
+        factors.append(self.data_parallel_size)
+        factors.append(envs.VLLM_ALL2ALL_BACKEND)
         return hashlib.sha256(str(factors).encode()).hexdigest()
 
     def __post_init__(self) -> None:
@@ -1854,6 +1919,8 @@ class ParallelConfig:
             if current_platform.is_neuron():
                 # neuron uses single process to control multiple devices
                 backend = "uni"
+            elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
+                backend = "uni"
             elif (current_platform.is_cuda()
                   and cuda_device_count_stateless() < self.world_size):
                 if not ray_found:
@@ -1862,6 +1929,10 @@ class ParallelConfig:
                                      "please install Ray with `pip install "
                                      "ray`.") from ray_utils.ray_import_err
                 backend = "ray"
+            elif self.data_parallel_backend == "ray":
+                logger.info("Using ray distributed inference because "
+                            "data_parallel_backend is ray")
+                backend = "ray"
             elif ray_found:
                 if self.placement_group:
                     backend = "ray"
@@ -1929,19 +2000,19 @@ class SchedulerConfig:
     runner_type: RunnerType = "generate"
     """The runner type to launch for the model."""
 
-    max_num_batched_tokens: int = None  # type: ignore
+    max_num_batched_tokens: SkipValidation[int] = None  # type: ignore
     """Maximum number of tokens to be processed in a single iteration.
 
     This config has no static default. If left unspecified by the user, it will
     be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    max_num_seqs: int = None  # type: ignore
+    max_num_seqs: SkipValidation[int] = None  # type: ignore
     """Maximum number of sequences to be processed in a single iteration.
 
     This config has no static default. If left unspecified by the user, it will
     be set in `EngineArgs.create_engine_config` based on the usage context."""
 
-    max_model_len: int = None  # type: ignore
+    max_model_len: SkipValidation[int] = None  # type: ignore
     """Maximum length of a sequence (including prompt and generated text). This
     is primarily set in `ModelConfig` and that value should be manually
     duplicated here."""
@@ -1980,7 +2051,7 @@ class SchedulerConfig:
     """Apply a delay (of delay factor multiplied by previous
     prompt latency) before scheduling next prompt."""
 
-    enable_chunked_prefill: bool = None  # type: ignore
+    enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
     """If True, prefill requests can be chunked based
     on the remaining max_num_batched_tokens."""
 
@@ -2046,6 +2117,12 @@ class SchedulerConfig:
     default scheduler. Can be a class directly or the path to a class of form
     "mod.custom_class"."""
 
+    disable_hybrid_kv_cache_manager: bool = False
+    """If set to True, KV cache manager will allocate the same size of KV cache
+    for all attention layers even if there are multiple type of attention layers
+    like full attention and sliding window attention.
+    """
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -2202,15 +2279,15 @@ Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"]
 
 
 @config
-@dataclass
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class DeviceConfig:
     """Configuration for the device to use for vLLM execution."""
 
-    device: Union[Device, torch.device] = "auto"
+    device: SkipValidation[Union[Device, torch.device]] = "auto"
     """Device type for vLLM execution.
-    This parameter is deprecated and will be 
-    removed in a future release. 
-    It will now be set automatically based 
+    This parameter is deprecated and will be
+    removed in a future release.
+    It will now be set automatically based
     on the current platform."""
     device_type: str = field(init=False)
     """Device type from the current platform. This is set in
@@ -2260,8 +2337,8 @@ class DeviceConfig:
             self.device = torch.device(self.device_type)
 
 
-SpeculativeMethod = Literal["ngram", "eagle", "medusa", "mlp_speculator",
-                            "draft_model", "deepseek_mtp"]
+SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa",
+                            "mlp_speculator", "draft_model", "deepseek_mtp"]
 SpeculativeAcceptanceMethod = Literal["rejection_sampler",
                                       "typical_acceptance_sampler"]
 
@@ -2272,8 +2349,7 @@ class SpeculativeConfig:
     """Configuration for speculative decoding."""
 
     # General speculative decoding control
-    num_speculative_tokens: int = field(default=None,
-                                        init=True)  # type: ignore
+    num_speculative_tokens: SkipValidation[int] = None  # type: ignore
     """The number of speculative tokens, if provided. It will default to the
     number in the draft model config if present, otherwise, it is required."""
     model: Optional[str] = None
@@ -2349,26 +2425,23 @@ class SpeculativeConfig:
     """Specifies the tree structure for speculative token generation.
     """
     # required configuration params passed from engine
-    target_model_config: ModelConfig = field(default=None,
-                                             init=True)  # type: ignore
+    target_model_config: SkipValidation[ModelConfig] = None  # type: ignore
     """The configuration of the target model."""
-    target_parallel_config: ParallelConfig = field(default=None,
-                                                   init=True)  # type: ignore
+    target_parallel_config: SkipValidation[
+        ParallelConfig] = None  # type: ignore
     """The parallel configuration for the target model."""
-    enable_chunked_prefill: bool = field(default=None,
-                                         init=True)  # type: ignore
+    enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
     """Whether vLLM is configured to use chunked prefill or not. Used for
     raising an error since it's not yet compatible with speculative decode."""
-    disable_log_stats: bool = field(default=None, init=True)  # type: ignore
+    disable_log_stats: SkipValidation[bool] = None  # type: ignore
     """Whether to disable the periodic printing of stage times in speculative
     decoding."""
 
     # params generated in the post-init stage
-    draft_model_config: ModelConfig = field(default=None,
-                                            init=True)  # type: ignore
+    draft_model_config: SkipValidation[ModelConfig] = None  # type: ignore
     """The configuration of the draft model initialized internal."""
-    draft_parallel_config: ParallelConfig = field(default=None,
-                                                  init=True)  # type: ignore
+    draft_parallel_config: SkipValidation[
+        ParallelConfig] = None  # type: ignore
     """The parallel configuration for the draft model initialized internal."""
 
     def compute_hash(self) -> str:
@@ -2552,7 +2625,8 @@ class SpeculativeConfig:
                     else:
                         eagle_config = EAGLEConfig(
                             self.draft_model_config.hf_config,
-                            method=self.method)
+                            method=self.method,
+                            model_type="eagle")
                         self.draft_model_config.hf_config = eagle_config
 
                 if (self.num_speculative_tokens is not None
@@ -2766,7 +2840,7 @@ LoRADType = Literal["auto", "float16", "bfloat16"]
 
 
 @config
-@dataclass
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class LoRAConfig:
     """Configuration for LoRA."""
 
@@ -2863,7 +2937,7 @@ class LoRAConfig:
 
 
 @config
-@dataclass
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class PromptAdapterConfig:
     """Configuration for PromptAdapters."""
 
@@ -3043,13 +3117,37 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
     "bfloat16": torch.bfloat16,
 }
 
-_ROCM_NOT_SUPPORTED_DTYPE: list[str] = []  #
+# model_type -> reason
+_FLOAT16_NOT_SUPPORTED_MODELS = {
+    "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.",
+    "gemma3": "Numerical instability. Please use bfloat16 or float32 instead.",
+    "plamo2": "Numerical instability. Please use bfloat16 or float32 instead.",
+    "glm4": "Numerical instability. Please use bfloat16 or float32 instead.",
+}
+
+
+def _is_valid_dtype(model_type: str, dtype: torch.dtype):
+    if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16:  # noqa: E501, SIM103
+        return False
 
+    return True
 
-def _get_and_verify_dtype(
+
+def _check_valid_dtype(model_type: str, dtype: torch.dtype):
+    if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16:
+        reason = _FLOAT16_NOT_SUPPORTED_MODELS[model_type]
+        raise ValueError(f"The model type {model_type!r} "
+                         f"does not support float16. Reason: {reason}")
+
+    return True
+
+
+def _find_dtype(
+    model_id: str,
     config: PretrainedConfig,
-    dtype: Union[str, torch.dtype],
-) -> torch.dtype:
+    *,
+    revision: Optional[str],
+):
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
     config_dtype = getattr(config, "torch_dtype", None)
@@ -3060,76 +3158,114 @@ def _get_and_verify_dtype(
         config_dtype = getattr(config.get_text_config(), "torch_dtype", None)
     if config_dtype is None and hasattr(config, "vision_config"):
         config_dtype = getattr(config.vision_config, "torch_dtype", None)
+    if config_dtype is None and hasattr(config, "encoder_config"):
+        config_dtype = getattr(config.encoder_config, "torch_dtype", None)
+
+    # Try to read the dtype of the weights if they are in safetensors format
+    if config_dtype is None:
+        repo_mt = try_get_safetensors_metadata(model_id, revision=revision)
+
+        if repo_mt and (files_mt := repo_mt.files_metadata):
+            param_dtypes: set[torch.dtype] = {
+                _SAFETENSORS_TO_TORCH_DTYPE[dtype_str]
+                for file_mt in files_mt.values()
+                for dtype_str in file_mt.parameter_count
+                if dtype_str in _SAFETENSORS_TO_TORCH_DTYPE
+            }
+
+            if param_dtypes:
+                return common_broadcastable_dtype(param_dtypes)
 
     if config_dtype is None:
         config_dtype = torch.float32
 
-    if isinstance(dtype, str):
-        dtype = dtype.lower()
-        if dtype == "auto":
-            # Set default dtype from model config
-            if config_dtype == torch.float32:
-                # Following common practice, we use float16 for float32 models
-                torch_dtype = torch.float16
-            else:
-                torch_dtype = config_dtype
+    return config_dtype
 
-            if config.model_type == "plamo2":
-                logger.warning(
-                    "For PLaMo2, we cast models to bfloat16 instead of using "
-                    "float16 by default. This is because float16 does not work."
-                )
-                torch_dtype = torch.bfloat16
 
-            # Deal with torch dtype fallback for device compatibility.
-            from vllm.platforms import current_platform
-            if torch_dtype not in current_platform.supported_dtypes:
-                device_name = current_platform.get_device_name()
+def _resolve_auto_dtype(
+    model_type: str,
+    config_dtype: torch.dtype,
+    *,
+    is_pooling_model: bool,
+):
+    from vllm.platforms import current_platform
 
-                if ((capability := current_platform.get_device_capability())
-                        is None):
-                    compute_str = ""
-                else:
-                    version_str = capability.as_version_str()
-                    compute_str = f" (with compute capability {version_str})"
-                fallback_dtype = current_platform.supported_dtypes[0]
-                logger.warning(
-                    "Your %s device%s doesn't support %s. " \
-                    "Falling back to %s for compatibility.",
-                    device_name, compute_str, torch_dtype, fallback_dtype
-                    )
-                torch_dtype = fallback_dtype
+    supported_dtypes = [
+        dtype for dtype in current_platform.supported_dtypes
+        if _is_valid_dtype(model_type, dtype)
+    ]
 
-            if current_platform.is_hpu() and torch_dtype == torch.float16:
-                logger.warning(
-                    "For HPU, we cast models to bfloat16 instead of "
-                    "using float16 by default. Please specify `dtype` if you "
-                    "want to use float16.")
-                torch_dtype = torch.bfloat16
-        elif dtype == "float16" and config.model_type == "plamo2":
-            logger.warning(
-                "For PLaMo2, using float16 is unstable and might cause "
-                "unexpected behavior. Please use bfloat16 or float32 instead.")
-            torch_dtype = torch.float16
+    if is_pooling_model and torch.float16 in supported_dtypes:
+        preferred_dtype = torch.float16
+    else:
+        preferred_dtype = supported_dtypes[0]
+
+    # Downcast for float32 models
+    if config_dtype == torch.float32:
+        config_dtype = preferred_dtype
+
+    if config_dtype in supported_dtypes:
+        return config_dtype
+
+    # Ensure device compatibility
+    device_name = current_platform.get_device_name()
+    device_capability = current_platform.get_device_capability()
+
+    if device_capability is None:
+        device_str = f"{device_name!r}"
+    else:
+        version_str = device_capability.as_version_str()
+        device_str = f"{device_name!r} (with compute capability {version_str})"
+
+    logger.warning(
+        "Your device %s doesn't support %s. "
+        "Falling back to %s for compatibility.",
+        device_str,
+        config_dtype,
+        preferred_dtype,
+    )
+
+    return preferred_dtype
+
+
+def _get_and_verify_dtype(
+    model_id: str,
+    config: PretrainedConfig,
+    dtype: Union[str, torch.dtype],
+    *,
+    is_pooling_model: bool,
+    revision: Optional[str] = None,
+) -> torch.dtype:
+    config_dtype = _find_dtype(model_id, config, revision=revision)
+    model_type = config.model_type
+
+    if isinstance(dtype, str):
+        dtype = dtype.lower()
+        if dtype == "auto":
+            # Set default dtype from model config
+            torch_dtype = _resolve_auto_dtype(
+                model_type,
+                config_dtype,
+                is_pooling_model=is_pooling_model,
+            )
         else:
             if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
-                raise ValueError(f"Unknown dtype: {dtype}")
+                raise ValueError(f"Unknown dtype: {dtype!r}")
             torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
     elif isinstance(dtype, torch.dtype):
         torch_dtype = dtype
     else:
         raise ValueError(f"Unknown dtype: {dtype}")
 
-    # Verify the dtype.
+    _check_valid_dtype(model_type, torch_dtype)
+
     if torch_dtype != config_dtype:
         if torch_dtype == torch.float32:
             # Upcasting to float32 is allowed.
             logger.info("Upcasting %s to %s.", config_dtype, torch_dtype)
-            pass
         elif config_dtype == torch.float32:
             # Downcasting from float32 to float16 or bfloat16 is allowed.
             logger.info("Downcasting %s to %s.", config_dtype, torch_dtype)
-            pass
         else:
             # Casting between float16 and bfloat16 is allowed with a warning.
             logger.warning("Casting %s to %s.", config_dtype, torch_dtype)
@@ -3795,12 +3931,14 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    use_cudagraph: bool = False
+    use_cudagraph: bool = envs.VLLM_USE_V1
     """Whether to use cudagraph inside compilation.
     - False: cudagraph inside compilation is not used.
     - True: cudagraph inside compilation is used. It requires
         that all input buffers have fixed addresses, and all
         splitting ops write their outputs to input buffers.
+    In the vLLM V1 Engine, this flag only applies for
+    CompilationLevel.PIECEWISE (aka -O3).
     Note that this is orthogonal to the cudagraph capture logic
     outside of compilation.
     TODO: move outside cudagraph logic into compilation.
@@ -3884,25 +4022,24 @@ class CompilationConfig:
 
     def __repr__(self) -> str:
         exclude = {
-            "static_forward_context",
-            "enabled_custom_ops",
-            "disabled_custom_ops",
-            "compilation_time",
-            "bs_to_padded_graph_size",
-            "pass_config",
-            "traced_files",
+            "static_forward_context": True,
+            "enabled_custom_ops": True,
+            "disabled_custom_ops": True,
+            "compilation_time": True,
+            "bs_to_padded_graph_size": True,
+            "pass_config": True,
+            "traced_files": True,
+            "inductor_compile_config": {
+                "post_grad_custom_post_pass": True,
+            },
         }
-        include = dict()
-        for k, v in asdict(self).items():
-            if k in exclude:
-                continue
-            f = get_field(CompilationConfig, k)
-            if (d := f.default) is not MISSING and d == v:
-                continue
-            if (df := f.default_factory) is not MISSING and df() == v:
-                continue
-            include[k] = v
-        return json.dumps(include)
+        # The cast to string is necessary because Pydantic is mocked in docs
+        # builds and sphinx-argparse doesn't know the return type of decode()
+        return str(
+            TypeAdapter(CompilationConfig).dump_json(
+                self,
+                exclude=exclude,  # type: ignore[arg-type]
+                exclude_unset=True).decode())
 
     __str__ = __repr__
 
@@ -3911,7 +4048,7 @@ class CompilationConfig:
         """Parse the CLI value for the compilation config."""
         if cli_value in ["0", "1", "2", "3"]:
             return cls(level=int(cli_value))
-        return cls(**json.loads(cli_value))
+        return TypeAdapter(CompilationConfig).validate_json(cli_value)
 
     def __post_init__(self) -> None:
         count_none = self.custom_ops.count("none")
@@ -4037,7 +4174,7 @@ class CompilationConfig:
 
 
 @config
-@dataclass
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class VllmConfig:
     """Dataclass which contains all vllm-related configuration. This
     simplifies passing around the distinct configurations in the codebase.
@@ -4267,25 +4404,22 @@ class VllmConfig:
             self.model_config.verify_dual_chunk_attention_config(
                 self.load_config)
 
-        if self.cache_config is not None:
-            self.cache_config.verify_with_parallel_config(self.parallel_config)
+        self.cache_config.verify_with_parallel_config(self.parallel_config)
 
-        if self.lora_config:
+        if self.lora_config is not None:
             self.lora_config.verify_with_cache_config(self.cache_config)
             self.lora_config.verify_with_model_config(self.model_config)
             self.lora_config.verify_lora_support()
-        if self.prompt_adapter_config:
+        if self.prompt_adapter_config is not None:
             self.prompt_adapter_config.verify_with_model_config(
                 self.model_config)
 
-        if self.quant_config is None and \
-            self.model_config is not None and self.load_config is not None:
+        if self.quant_config is None and self.model_config is not None:
             self.quant_config = VllmConfig._get_quantization_config(
                 self.model_config, self.load_config)
 
         from vllm.platforms import current_platform
-        if self.scheduler_config is not None and \
-            self.model_config is not None and \
+        if self.model_config is not None and \
             self.scheduler_config.chunked_prefill_enabled and \
             self.model_config.dtype == torch.float32 and \
             current_platform.get_device_capability() == (7, 5):
@@ -4294,9 +4428,6 @@ class VllmConfig:
                 "To workaround this limitation, vLLM will set 'ieee' input "
                 "precision for chunked prefill triton kernels.")
 
-        if self.compilation_config is None:
-            self.compilation_config = CompilationConfig()
-
         # async tp is built on top of sequence parallelism
         # and requires it to be enabled.
         if self.compilation_config.pass_config.enable_async_tp:
@@ -4306,15 +4437,9 @@ class VllmConfig:
             self.compilation_config.custom_ops.append("+rms_norm")
         if envs.VLLM_USE_V1 and self.model_config is not None and \
             not self.model_config.enforce_eager:
-            # NOTE(woosuk): Currently, we use inductor because the piecewise
-            # CUDA graphs do not work properly with the custom CUDA kernels.
-            # FIXME(woosuk): Disable inductor to reduce the compilation time
-            # and avoid any potential issues with the inductor.
             # FIXME(rob): Add function to set all of these.
             if not self.compilation_config.custom_ops:
                 self.compilation_config.custom_ops = ["none"]
-            self.compilation_config.use_cudagraph = True
-            self.compilation_config.use_inductor = True
             self.compilation_config.cudagraph_num_of_warmups = 1
             self.compilation_config.pass_config.enable_fusion = False
             self.compilation_config.pass_config.enable_noop = False
@@ -4323,8 +4448,7 @@ class VllmConfig:
 
         self._set_cudagraph_sizes()
 
-        if self.cache_config is not None and \
-            self.cache_config.cpu_offload_gb > 0 and \
+        if self.cache_config.cpu_offload_gb > 0 and \
             self.compilation_config.level != CompilationLevel.NO_COMPILATION \
                 and not envs.VLLM_USE_V1:
             logger.warning(
@@ -4346,16 +4470,16 @@ class VllmConfig:
                 "full_cuda_graph is not supported with "
                 "cascade attention. Disabling cascade attention.")
             self.model_config.disable_cascade_attn = True
-            if self.cache_config is not None:
-                self.cache_config.enable_prefix_caching = False
+            self.cache_config.enable_prefix_caching = False
 
-        if (self.kv_events_config
+        if (self.kv_events_config is not None
                 and self.kv_events_config.enable_kv_cache_events
                 and not self.cache_config.enable_prefix_caching):
             logger.warning(
                 "KV cache events are on, but prefix caching is not enabled."
                 "Use --enable-prefix-caching to enable.")
-        if (self.kv_events_config and self.kv_events_config.publisher != "null"
+        if (self.kv_events_config is not None
+                and self.kv_events_config.publisher != "null"
                 and not self.kv_events_config.enable_kv_cache_events):
             logger.warning("KV cache events are disabled,"
                            "but the scheduler is configured to publish them."
@@ -4366,6 +4490,21 @@ class VllmConfig:
         if not self.instance_id:
             self.instance_id = random_uuid()[:5]
 
+        if (envs.VLLM_USE_V1
+                and not self.scheduler_config.disable_hybrid_kv_cache_manager):
+            # logger should only print warning message for hybrid models. As we
+            # can't know whether the model is hybrid or not now, so we don't log
+            # warning message here and will log it later.
+            if not (current_platform.is_cuda() or current_platform.is_rocm()):
+                # Hybrid KV cache manager is not supported on non-GPU platforms.
+                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+            if self.kv_transfer_config is not None:
+                # Hybrid KV cache manager is not compatible with KV transfer.
+                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+            if self.kv_events_config is not None:
+                # Hybrid KV cache manager is not compatible with KV events.
+                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+
     def update_sizes_for_sequence_parallelism(self,
                                               possible_sizes: list) -> list:
         # remove the sizes that not multiple of tp_size when
@@ -4471,6 +4610,13 @@ class VllmConfig:
         self.compilation_config.init_with_cudagraph_sizes(
             batch_size_capture_list)
 
+    def recalculate_max_model_len(self, max_model_len: int):
+        model_config = self.model_config
+        max_model_len = model_config.get_and_verify_max_len(max_model_len)
+        self.model_config.max_model_len = max_model_len
+        self.scheduler_config.max_model_len = max_model_len
+        self.compute_hash()
+
     def __str__(self):
         return (
             f"model={self.model_config.model!r},"
diff --git a/vllm/connections.py b/vllm/connections.py
index 84e32a4d5ca9cda4e9b5d88cd41e93438b042a6d..103505eb3d81fb488e18b1841f36b4811a4e71f9 100644
--- a/vllm/connections.py
+++ b/vllm/connections.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Mapping, MutableMapping
 from pathlib import Path
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index d4d31c58dc8d4c607bf012129ed1d8f7883916f9..444bb25f2830aac06e8f7df2cf3ec68b7a8ea6ad 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from typing import List, Optional
diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py
index 1966eac1cf9e0c0a8f8522a0a3e72c64f57d2952..a337007a9eaa6f3bfc46cb9b9fa0f5585590cfc9 100644
--- a/vllm/core/block/common.py
+++ b/vllm/core/block/common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections import deque
 from dataclasses import dataclass
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index d64142e77f37f8cb125f896957de1cdf21a5b711..ea490c32791c709ee6ca99d9e392cd6b8915e6da 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Dict, FrozenSet, List, Optional, Tuple
 
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index 301656996435ba92d4b44721a1e08899be656c54..1a05881f7c0050ae8ad64e44ef7725157975f1c9 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
index c388366b825f2c236712d4af068b47714701d95d..dae6ead04e9c94c6f3c76186063839f996b33a04 100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections import deque
 from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 1ca9e49dac371bd9fd835bc36f6e654235ac5d6c..2913a01bf34a5ec7a721f6d11522a180a97b5830 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Token blocks."""
 import sys
 from bisect import bisect_left
diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py
index 910afdd9feff1d616339fda39071494f913b9c36..e933c6ee7c8bd097021386d7e0f7294bca865c20 100644
--- a/vllm/core/block/utils.py
+++ b/vllm/core/block/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Block manager utils."""
 from vllm.sequence import SequenceGroup
 from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
index c6bf6d163132ec7362c11c72b0558f1bc5fbc333..a33399204fafa123a83121b2b77f52937607cb00 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A block manager that manages token blocks."""
 from typing import Dict, List, Optional
 from typing import Sequence as GenericSequence
diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py
index 0e363eddc8a5e933173cc42f4c3a8eeafbe85c23..7ec4768e90b1a537448f91bbb39135afa277e255 100644
--- a/vllm/core/evictor.py
+++ b/vllm/core/evictor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 import heapq
diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py
index 4c1182debcec1faeed3b4664a9c59cab122a8c14..ba290eeda12b59c00641294d2bd3a36ae1f280ab 100644
--- a/vllm/core/interfaces.py
+++ b/vllm/core/interfaces.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 from abc import ABC, abstractmethod
diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py
index 0f5d8ca6dc7ea4fafa526b7ea2951218a192b806..71b22942a3edd533c5151ab86ee70fec52bc0011 100644
--- a/vllm/core/placeholder_block_space_manager.py
+++ b/vllm/core/placeholder_block_space_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List, Optional, Tuple
 
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 06d4ed470b2092f0a513d814ebdb0784def4f57c..44be855b1bfde666162f4f473f8be6ebc10f9d24 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 import os
diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 6fcbca628c6aa8be5aed7aeea7c553775bbe6988..942e866ed97ee3deee433d4b3c33e546e0363f09 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # cumem-based pytorch pluggable allocator to implement sleep mode.
 # other approaches tried but failed:
diff --git a/vllm/distributed/__init__.py b/vllm/distributed/__init__.py
index 39955ddacfe947c6c673520e97acbd7a348e5ecb..e911b2a1ab284b3e8a1c38887e7541525db5d8aa 100644
--- a/vllm/distributed/__init__.py
+++ b/vllm/distributed/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .communication_op import *
 from .parallel_state import *
diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py
index d85a41ddac2219f43b34bf675d39ae4d7778540c..0a5a95176f7c354825f6f5fac6d5f74892819829 100644
--- a/vllm/distributed/communication_op.py
+++ b/vllm/distributed/communication_op.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional, Union
 
diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index a250ec89cd5ba46948d11e098ce4870d133c392e..35f2fd0ba9e2210c2f051b9186e08bc68c164b38 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib.util
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.distributed as dist
@@ -124,3 +125,140 @@ class PPLXAll2AllManager(All2AllManagerBase):
             from pplx_kernels.nvshmem import nvshmem_finalize
             logger.debug("PPLX NVSHMEM finalize")
             nvshmem_finalize()
+
+
+class DeepEPAll2AllManagerBase(All2AllManagerBase):
+    """
+    All2All communication based on DeepEP High-Throughput kernels.
+    """
+
+    def __init__(self, cpu_group):
+        has_deepep = importlib.util.find_spec("deep_ep") is not None
+        assert has_deepep, "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install DeepEP kernels."  # noqa
+        super().__init__(cpu_group)
+        self.handle_cache = Cache()
+
+        # This is the DeepEP default. Stick to it till we can establish
+        # reasonable defaults based on profiling.
+        self.num_sms = 20
+
+    def get_handle(self, kwargs):
+        raise NotImplementedError
+
+    def dispatch(self, hidden_states: torch.Tensor,
+                 router_logits: torch.Tensor):
+        raise NotImplementedError
+
+    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+
+    def destroy(self):
+        pass
+
+
+class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
+    """
+    All2All communication based on DeepEP High-Throughput kernels.
+    """
+
+    def __init__(self, cpu_group):
+        super().__init__(cpu_group)
+
+    def _make_all2all_kwargs(self) -> dict[Any, Any]:
+        # Defaults for internode and intranode are taken from DeepEP tests.
+        num_nvl_bytes = 1024 * 1024 * 1024
+        num_rdma_bytes = None
+        num_qps_per_rank = None
+
+        if self.internode:
+            num_rdma_bytes = 1024 * 1024 * 1024
+            num_qps_per_rank = self.num_sms // 2
+        else:
+            num_rdma_bytes = 0
+            num_qps_per_rank = 1
+
+        assert num_rdma_bytes is not None
+        assert num_qps_per_rank is not None
+        return dict(group=self.cpu_group,
+                    num_nvl_bytes=num_nvl_bytes,
+                    num_rdma_bytes=num_rdma_bytes,
+                    low_latency_mode=False,
+                    num_qps_per_rank=num_qps_per_rank)
+
+    def get_handle(self, kwargs):
+
+        assert len(kwargs) == 0, (
+            "DeepEPHTAll2AllManager expects no arguments. All the required "
+            "args are computed in the Manager itself.")
+
+        import deep_ep
+        buffer_kwargs = self._make_all2all_kwargs()
+        logger.debug("DeepEP all2all args %s", buffer_kwargs)
+        handle: deep_ep.Buffer = self.handle_cache.get_or_create(
+            buffer_kwargs, deep_ep.Buffer)
+        # It is dangerous to set num sms outside this function. num_sms is not
+        # a part of the hash-key that identifies this object. If we are in a
+        # situation where we make objects with different num_sms, the hash key
+        # in get_or_create must be updated.
+        handle.set_num_sms(self.num_sms)
+        return handle
+
+
+class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
+    """
+    All2All communication based on DeepEP Low-Latency kernels.
+    """
+
+    def __init__(self, cpu_group):
+        super().__init__(cpu_group)
+
+    def _make_all2all_kwargs(
+        self,
+        max_num_tokens_per_dp_rank: int,
+        token_hidden_size: int,
+        num_ep_ranks: int,
+        num_global_experts: int,
+        num_local_experts: int,
+    ) -> dict[Any, Any]:
+        """
+        max_num_tokens_per_dp_rank : the maximum number of tokens a DP rank
+          can dispatch all the ranks must hold the same value.
+        token_hidden_size: the hidden dimension of each token.
+        num_ep_ranks: the number of EP group ranks.
+        num_global_experts: Number of experts in the model.
+        num_local_experts: Number of experts in an EP rank.
+        """
+        import deep_ep
+
+        # Defaults for internode and intranode are taken from DeepEP tests.
+        num_nvl_bytes = 1024 * 1024 * 1024
+        num_qps_per_rank = num_local_experts
+        num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
+            num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
+            hidden=token_hidden_size,
+            num_ranks=num_ep_ranks,
+            num_experts=num_global_experts)
+
+        assert num_rdma_bytes is not None
+        return dict(group=self.cpu_group,
+                    num_nvl_bytes=num_nvl_bytes,
+                    num_rdma_bytes=num_rdma_bytes,
+                    low_latency_mode=True,
+                    num_qps_per_rank=num_qps_per_rank)
+
+    def get_handle(self, kwargs):
+        """
+        The kwargs for DeepEPLLAll2AllManager is dictated by
+        _make_all2all_kwargs.
+        """
+        import deep_ep
+        buffer_kwargs = self._make_all2all_kwargs(**kwargs)
+        logger.debug("DeepEP all2all args %s", buffer_kwargs)
+        handle: deep_ep.Buffer = self.handle_cache.get_or_create(
+            buffer_kwargs, deep_ep.Buffer)
+        # It is dangerous to set num sms outside this function. num_sms is not
+        # a part of the hash-key that identifies this object. If we are in a
+        # situation where we make objects with different num_sms, the hash key
+        # in get_or_create must be updated.
+        handle.set_num_sms(self.num_sms)
+        return handle
diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py
index 52b970949144f5513430531d11840b12a02a71c0..1bc2d8e0281c75d8253ff20fc2e9becd6408eae7 100644
--- a/vllm/distributed/device_communicators/base_device_communicator.py
+++ b/vllm/distributed/device_communicators/base_device_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import threading
 from typing import Optional
 from weakref import WeakValueDictionary
@@ -48,8 +49,7 @@ class All2AllManagerBase:
 
         # all2all communication often has separate implementations for
         # intra-node and inter-node communication
-        self.intranode = in_the_same_node_as(cpu_group, source_rank=0)
-        self.internode = not self.intranode
+        self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0))
 
     def get_handle(self, kwargs):
         # get a handle for the all2all communication,
diff --git a/vllm/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py
index c04218cb9f3945e2cd919cb6c4adcb6f4cd7e556..94effa0b2ca88adaa7238ab56f3d2cb46719d7aa 100644
--- a/vllm/distributed/device_communicators/cpu_communicator.py
+++ b/vllm/distributed/device_communicators/cpu_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Optional
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index a05a13f51d4bcec4daae7dcdfffa1bda36548bda..055d91690e67652b056d80457b2ab573c79aac95 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
@@ -66,6 +67,14 @@ class CudaCommunicator(DeviceCommunicatorBase):
                 from .all2all import PPLXAll2AllManager
                 self.all2all_manager = PPLXAll2AllManager(self.cpu_group)
                 logger.info("Using PPLX all2all manager.")
+            elif all2all_backend == "deepep_high_throughput":
+                from .all2all import DeepEPHTAll2AllManager
+                self.all2all_manager = DeepEPHTAll2AllManager(self.cpu_group)
+                logger.info("Using DeepEP High-Throughput all2all manager.")
+            elif all2all_backend == "deepep_low_latency":
+                from .all2all import DeepEPLLAll2AllManager
+                self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group)
+                logger.info("Using DeepEP Low-Latency all2all manager.")
             else:
                 raise ValueError(f"Unknown all2all backend: {all2all_backend}")
 
diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py
index 6c15ef644b8c2ed98ef48c575e6504f514ef7804..2c38e8ed21d7df2e93efb0cf17753f6628bcdde6 100644
--- a/vllm/distributed/device_communicators/cuda_wrapper.py
+++ b/vllm/distributed/device_communicators/cuda_wrapper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This file is a pure Python wrapper for the cudart library.
 It avoids the need to compile a separate shared library, and is
 convenient for use when we just need to call a few functions.
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 3c117eda8128667397e6dd3b5ddf3f81b25ba59c..585b7330aa9a985ae13564f223e5e126d61edd57 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
 from typing import Optional, Union
diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 11b8b57fe2aed4e1411945dfcb96a2242241846e..7c6001e870392292b962a48b37be029e6e1a623a 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import ctypes
 import json
diff --git a/vllm/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py
index 9536a7f883e1bdb1a2c9c09b2985d8ae03e4c087..f00f6b62bf24abd2700a1a90bc7dd0058d94494c 100644
--- a/vllm/distributed/device_communicators/hpu_communicator.py
+++ b/vllm/distributed/device_communicators/hpu_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.distributed as dist
diff --git a/vllm/distributed/device_communicators/neuron_communicator.py b/vllm/distributed/device_communicators/neuron_communicator.py
index dfa4b5194bdbe1a7060d6222d166f35d65f37125..5b61a1687a0168390af6bb62b1173fc24fd0f1b7 100644
--- a/vllm/distributed/device_communicators/neuron_communicator.py
+++ b/vllm/distributed/device_communicators/neuron_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 
 from vllm.distributed.device_communicators.base_device_communicator import (
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 0ccd423121cb0d6ccb9621eb9f2b464bee6bb057..29486292996adc25f685ab5e9dc6ce5994555233 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py
index 6f69089b61968c949e2ad26245b9e08486af0260..04a4d0147f5d8b9ba31ac6e30b9b5a5c1e284ffe 100644
--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # This file is a pure Python wrapper for the NCCL library.
 # The main purpose is to use NCCL combined with CUDA graph.
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 40e57e6624d1eb90a9982fb8edbf370f2cad59ae..c7810043b81e886096501ccf4479055464148385 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pickle
 import time
@@ -27,6 +28,43 @@ VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL
 logger = init_logger(__name__)
 
 
+class SpinTimer:
+
+    def record_activity(self):
+        pass
+
+    def spin(self):
+        sched_yield()
+
+
+class SpinSleepTimer(SpinTimer):
+    """
+    In setups which have long inactivity periods it is desirable to reduce
+    system power consumption when vllm does nothing. This would lead to more
+    CPU thermal headroom when a request eventually comes, especially when
+    multiple GPUs are connected as each GPU would otherwise pin one thread at
+    100% CPU usage.
+
+    The simplest solution is to reduce polling frequency when there is no
+    activity for a certain period of time.
+    """
+
+    def __init__(self, busy_loop_s: float = 3.0, wait_sleep_s: float = 0.1):
+        self.last_activity = time.monotonic()
+        self.busy_loop_s = busy_loop_s
+        self.wait_sleep_s = wait_sleep_s
+
+    def record_activity(self):
+        self.last_activity = time.monotonic()
+
+    def spin(self):
+        curr_time = time.monotonic()
+        if curr_time >= self.last_activity + self.busy_loop_s:
+            time.sleep(self.wait_sleep_s)
+        else:
+            sched_yield()
+
+
 class ShmRingBuffer:
 
     def __init__(self,
@@ -41,7 +79,7 @@ class ShmRingBuffer:
         of items that can be stored in the buffer are known in advance.
         In this case, we don't need to synchronize the access to
          the buffer.
-        
+
         Buffer memory layout:
                   data                                 metadata
                     |                                      |
@@ -237,6 +275,7 @@ class MessageQueue:
         self.local_reader_rank = -1
         # rank does not matter for remote readers
         self._is_remote_reader = False
+        self._read_spin_timer = SpinTimer()
 
         self.handle = Handle(
             local_reader_ranks=local_reader_ranks,
@@ -275,6 +314,9 @@ class MessageQueue:
             self.local_socket.connect(socket_addr)
 
             self.remote_socket = None
+
+            self._read_spin_timer = SpinSleepTimer(
+            ) if envs.VLLM_SLEEP_WHEN_IDLE else SpinTimer()
         else:
             self.buffer = None  # type: ignore
             self.current_idx = -1
@@ -406,7 +448,7 @@ class MessageQueue:
                     # we need to wait until it is written
 
                     # Release the processor to other threads
-                    sched_yield()
+                    self._read_spin_timer.spin()
 
                     # if we wait for a long time, log a message
                     if (time.monotonic() - start_time
@@ -437,6 +479,8 @@ class MessageQueue:
                 metadata_buffer[self.local_reader_rank + 1] = 1
                 self.current_idx = (self.current_idx +
                                     1) % self.buffer.max_chunks
+
+                self._read_spin_timer.record_activity()
                 break
 
     def enqueue(self, obj, timeout: Optional[float] = None):
diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py
index a1775279661d162eab119ff3bcb3366a3314b4b3..c60a7a7eb25cf88c41a7dcb92d498ced35d6f2ae 100644
--- a/vllm/distributed/device_communicators/tpu_communicator.py
+++ b/vllm/distributed/device_communicators/tpu_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Optional
diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py
index 256e7965e0a7241c96aca0a1c4fb3a53e505fdfe..216ff85c8bb7e8a2724d5c166f30e1d02fbcd695 100644
--- a/vllm/distributed/device_communicators/xpu_communicator.py
+++ b/vllm/distributed/device_communicators/xpu_communicator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py
index 29c6a70c4d26fd4bec3f0f30de50bfd6d88903f2..2d7935773dd9f03ed2e8033680c7dce97b6c2290 100644
--- a/vllm/distributed/kv_events.py
+++ b/vllm/distributed/kv_events.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import queue
 import threading
@@ -27,6 +28,7 @@ class EventBatch(
 ):
     ts: float
     events: list[Any]
+    data_parallel_rank: Optional[int] = None
 
 
 class KVCacheEvent(
@@ -59,7 +61,22 @@ class KVEventBatch(EventBatch):
 
 
 class EventPublisher(ABC):
-    """Lightweight publisher for EventBatch batches."""
+    """Lightweight publisher for EventBatch batches with data parallelism
+    support.
+    
+    In data parallel setups, each DP rank runs its own EventPublisher instance
+    to avoid duplicate events and ensure proper event attribution:
+    
+    - Each DP rank creates a separate publisher
+    - Publishers automatically annotate events with their data_parallel_rank
+    - This allows consumers to distinguish events from different DP ranks
+    
+    The publisher is responsible for adding DP metadata since the scheduler
+    operates independently of DP topology and shouldn't need DP awareness.
+    """
+
+    def __init__(self, data_parallel_rank: int = 0) -> None:
+        self._data_parallel_rank = data_parallel_rank
 
     @abstractmethod
     def publish(self, events: EventBatch) -> None:
@@ -112,6 +129,7 @@ class ZmqEventPublisher(EventPublisher):
 
     def __init__(
         self,
+        data_parallel_rank: int,
         endpoint: str = "tcp://*:5557",
         replay_endpoint: Optional[str] = None,
         buffer_steps: int = 10_000,
@@ -120,6 +138,7 @@ class ZmqEventPublisher(EventPublisher):
         topic: str = "",
     ) -> None:
         # Storage
+        super().__init__(data_parallel_rank)
         self._event_queue = Queue[Optional[EventBatch]](maxsize=max_queue_size)
         self._buffer = deque[tuple[int, bytes]](maxlen=buffer_steps)
 
@@ -127,8 +146,11 @@ class ZmqEventPublisher(EventPublisher):
         self._ctx = zmq.Context.instance()
         self._pub: Optional[zmq.Socket] = None
         self._replay: Optional[zmq.Socket] = None
-        self._endpoint = endpoint
-        self._replay_endpoint = replay_endpoint
+        self._dp_rank = data_parallel_rank
+
+        self._endpoint = self.offset_endpoint_port(endpoint, self._dp_rank)
+        self._replay_endpoint = self.offset_endpoint_port(
+            replay_endpoint, self._dp_rank)
         self._hwm = hwm
         self._socket_setup()
 
@@ -148,6 +170,8 @@ class ZmqEventPublisher(EventPublisher):
     def publish(self, events: EventBatch) -> None:
         if not self._running:
             raise RuntimeError("Publisher is closed")
+        if events.data_parallel_rank is None:
+            events.data_parallel_rank = self._data_parallel_rank
         self._event_queue.put(events)
 
     def shutdown(self) -> None:
@@ -190,11 +214,12 @@ class ZmqEventPublisher(EventPublisher):
             self._pub.set_hwm(self._hwm)
             # Heuristic: bind if wildcard / * present, else connect.
             # bind stable, connect volatile convention
-            if ("*" in self._endpoint or "::" in self._endpoint
-                    or self._endpoint.startswith("ipc://")
-                    or self._endpoint.startswith("inproc://")):
+            if (self._endpoint is not None
+                    and ("*" in self._endpoint or "::" in self._endpoint
+                         or self._endpoint.startswith("ipc://")
+                         or self._endpoint.startswith("inproc://"))):
                 self._pub.bind(self._endpoint)
-            else:
+            elif self._endpoint is not None:
                 self._pub.connect(self._endpoint)
 
         # Set up replay socket: use ROUTER
@@ -265,6 +290,38 @@ class ZmqEventPublisher(EventPublisher):
         # receiving payload is (-1, b""")
         self._replay.send_multipart((client_id, b"", self.END_SEQ, b""))
 
+    @staticmethod
+    def offset_endpoint_port(endpoint: Optional[str],
+                             data_parallel_rank: int) -> Optional[str]:
+        """Helper function to offset the port in an endpoint by 
+            the data parallel rank.
+
+        Args:
+            endpoint: The endpoint string 
+                (e.g., "tcp://*:5557" or "inproc://cache")
+            data_parallel_rank: The data parallel rank to offset by
+
+        Returns:
+            The endpoint with the port offset by data_parallel_rank 
+                or suffix appended
+        """
+        # Do nothing if input is None or data_parallel_rank is 0
+        if not endpoint or data_parallel_rank == 0:
+            return endpoint
+
+        if "inproc" in endpoint:
+            return f"{endpoint}_dp{data_parallel_rank}"
+        if "tcp" in endpoint:
+            if endpoint and ":" in endpoint:
+                # Get everything after the last colon (the port)
+                last_colon_idx = endpoint.rfind(":")
+                base_addr = endpoint[:last_colon_idx]
+                base_port = int(endpoint[last_colon_idx + 1:])
+                new_port = base_port + data_parallel_rank
+                return f"{base_addr}:{new_port}"
+            return endpoint
+        raise ValueError("Invalid endpoint: must contain 'inproc' or 'tcp'")
+
 
 class EventPublisherFactory:
     _registry: dict[str, Callable[..., EventPublisher]] = {
@@ -280,7 +337,9 @@ class EventPublisherFactory:
         cls._registry[name] = ctor
 
     @classmethod
-    def create(cls, config: Optional[KVEventsConfig]) -> EventPublisher:
+    def create(cls,
+               config: Optional[KVEventsConfig],
+               data_parallel_rank: int = 0) -> EventPublisher:
         """Create publisher from a config mapping."""
         if not config:
             return NullEventPublisher()
@@ -293,4 +352,5 @@ class EventPublisherFactory:
             constructor = cls._registry[kind]
         except KeyError as exc:
             raise ValueError(f"Unknown event publisher '{kind}'") from exc
-        return constructor(**config_dict)
+        return constructor(data_parallel_rank=data_parallel_rank,
+                           **config_dict)
diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py
index 8b6abf5a80dd0ead2f2d8f2b06305aa1638eefeb..fa9b7e4f14c020ccd6e12863173b6e81b318837d 100644
--- a/vllm/distributed/kv_transfer/__init__.py
+++ b/vllm/distributed/kv_transfer/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.distributed.kv_transfer.kv_transfer_state import (
     KVConnectorBaseType, ensure_kv_transfer_initialized, get_kv_transfer_group,
diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py
index e9b70610e8cdfe534c202fd6bc32ae79d9794f0e..181c33925da76fed9b66f1406feb20ac7615f81e 100644
--- a/vllm/distributed/kv_transfer/kv_connector/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 KVConnectorBase Class for Distributed KV Cache & Hidden State communication
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index 06b3983ed68bda3c9c60b2d5b1d80128f9c287e6..58dfa251c735d60277ddb838714c10c058d1e16b 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import importlib
 from typing import TYPE_CHECKING, Callable
@@ -70,7 +71,8 @@ class KVConnectorFactory:
             connector_module = importlib.import_module(connector_module_path)
             connector_cls = getattr(connector_module, connector_name)
         assert issubclass(connector_cls, KVConnectorBase_V1)
-        logger.info("Creating v1 connector with name: %s", connector_name)
+        logger.info("Creating v1 connector with name: %s and engine_id: %s",
+                    connector_name, kv_transfer_config.engine_id)
         # NOTE(Kuntai): v1 connector is explicitly separated into two roles.
         # Scheduler connector:
         # - Co-locate with scheduler process
diff --git a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py
index d121cb701bef317cb61e977af6a246d5f05201a0..78bf3095613a729a2602ed9bc31dbe95862c9a7a 100644
--- a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LMCache KV Cache Connector for Distributed Machine Learning Inference
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
index 58eabd0a37ebb02309027c9c76295e3e4ff91aef..94a7ce91acf172e4d1beca45631510cc333d421c 100644
--- a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 MooncakeStore Connector for Distributed Machine Learning Inference
 The MooncakeStoreConnector transfers KV caches between prefill vLLM workers
diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
index ed8fe38161e971f40518562ed9e942e8823fa5ef..e7c079e1f115cd0cd626d75440541db170fe9549 100644
--- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Simple KV Cache Connector for Distributed Machine Learning Inference
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index b1c9c9af6e23540d17de960392d22ea7bf71523f..b9bed06d791c5202f748f283ebbc634a3b3dffed 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -1,12 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 KV cache helper for store.
 """
+
 import torch
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
-from vllm.config import VllmConfig
+from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -89,3 +91,18 @@ class model_aware_kv_ops_helper:
                 layer.self_attn.attn._k_scale,
                 layer.self_attn.attn._v_scale,
             )
+
+
+def get_kv_connector_cache_layout():
+    vllm_config = get_current_vllm_config()
+    kv_config = vllm_config.kv_transfer_config
+    if vllm_config.model_config is None:
+        logger.warning("Unable to detect current VLLM config. " \
+        "Defaulting to NHD kv cache layout.")
+    else:
+        use_mla = vllm_config.model_config.use_mla
+        if not use_mla and kv_config.kv_connector == "NixlConnector":
+            logger.info("NixlConnector detected. Setting KV cache " \
+            "layout to HND for better xfer performance.")
+            return "HND"
+    return "NHD"
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py
index e66aaa7f8af8ef38080a358fd3fe60d7c3530cf5..f00f31dde915ad0527965a72e9332ec8fb0bd3a9 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorRole)
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index bc9258e9d07b6fd57e54fbcab848325a8849d980..f80b5eba235ddf2c67aecff9bbd62aa1fcd19f95 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 KVConnectorBase_V1 Class for Distributed KV Cache & Hidden State
 communication in vLLM v1
@@ -7,9 +8,15 @@ The class provides the following primitives:
     Scheduler-side: runs in the scheduler, binds metadata, which
     is used by the worker-side to load/save KV cache.
         get_num_new_matched_tokens() - get number of new tokens 
-            that exist in the remote KV cache
+            that exist in the remote KV cache. Might be called multiple
+            times for a given request and should be side-effect free.
         update_state_after_alloc() - update KVConnector state after
             temporary buffer alloc by the CacheManager.
+        request_finished() - called when a request is finished, with
+            the computed kv cache blocks for the request.
+            Returns whether KV cache should be freed now or will be
+            freed asynchronously and optionally returns KV transfer
+            params.
 
     Worker-side: runs in each worker, loads/saves KV cache to/from
     the Connector based on the metadata.
@@ -18,6 +25,9 @@ The class provides the following primitives:
 
         save_kv_layer() - starts saving KV for layer i (maybe async)
         wait_for_save() - blocks until all saves are done
+
+        get_finished() - called with ids of finished requests, returns
+            ids of requests that have completed async sending/recving.
 """
 
 import enum
@@ -183,7 +193,8 @@ class KVConnectorBase_V1(ABC):
         finished generating tokens.
 
         Returns:
-            ids of requests that have finished asynchronous transfer,
+            ids of requests that have finished asynchronous transfer
+            (requests that previously returned True from request_finished()),
             tuple of (sending/saving ids, recving/loading ids).
             The finished saves/sends req ids must belong to a set provided in a
             call to this method (this call or a prior one).
@@ -214,7 +225,8 @@ class KVConnectorBase_V1(ABC):
                 - The number of tokens that can be loaded from the 
                   external KV cache beyond what is already computed.
                 - `True` if external KV cache tokens will be loaded
-                  asynchronously (between scheduler steps).
+                  asynchronously (between scheduler steps). Must be
+                  'False' if the first element is 0.
         """
         pass
 
@@ -224,6 +236,18 @@ class KVConnectorBase_V1(ABC):
                                  num_external_tokens: int):
         """
         Update KVConnector state after block allocation.
+
+        If get_num_new_matched_tokens previously returned True for a
+        request, this function may be called twice for that same request -
+        first when blocks are allocated for the connector tokens to be
+        asynchronously loaded into, and second when any additional blocks
+        are allocated, after the load/transfer is complete.
+
+        Args:
+            request (Request): the request object.
+            blocks (KVCacheBlocks): the blocks allocated for the request.
+            num_external_tokens (int): the number of tokens that will be
+                loaded from the external KV cache.
         """
         pass
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
index 2cb68dc1ff6751493d4050572523f81bf6818ca1..cc1f4ba356428d225a0d581e02d8fffef78a47c0 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import TYPE_CHECKING
 
 import torch
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index 0aabb260fd3dce9a81fb82152927370d8c4fa9bb..be3c233994199823c9bc671296bb4a680799c268 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional
@@ -11,12 +12,12 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
 from vllm.logger import init_logger
+from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.sched.output import SchedulerOutput
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
     from vllm.forward_context import ForwardContext
-    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
     from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -50,8 +51,9 @@ class MultiConnector(KVConnectorBase_V1):
             self._connectors.append(
                 KVConnectorFactory.create_connector_v1(temp_config, role))
 
-        # A mapping from request id to the connector that is assigned to it.
-        self._requests_to_connector: dict[str, KVConnectorBase_V1] = {}
+        # A mapping from request id to the index of the connector chosen to
+        # load the request from (if any).
+        self._requests_to_connector: dict[str, int] = {}
 
         # Keeps track of *additional* remaining async saves (beyond 1) to be
         # finished per request. Not needed for async loads since we only allow
@@ -135,25 +137,31 @@ class MultiConnector(KVConnectorBase_V1):
         request: "Request",
         num_computed_tokens: int,
     ) -> tuple[int, bool]:
-        for c in self._connectors:
+        to_return = (0, False)
+        for i, c in enumerate(self._connectors):
             toks, load_async = c.get_num_new_matched_tokens(
                 request, num_computed_tokens)
             # The first connector that has new matched tokens will be assigned
             # to this request.
-            if toks > 0:
-                self._requests_to_connector[request.request_id] = c
-                return toks, load_async
-        return 0, False
+            if to_return[0] == 0 and toks > 0:
+                self._requests_to_connector[request.request_id] = i
+                to_return = (toks, load_async)
+        return to_return
 
     def update_state_after_alloc(self, request: "Request",
                                  blocks: "KVCacheBlocks",
                                  num_external_tokens: int):
-        # If the request is not assigned to any connector, we do nothing.
-        if request.request_id not in self._requests_to_connector:
-            return
-        # We assume that the request is assigned to only one connector.
-        c = self._requests_to_connector.pop(request.request_id)
-        c.update_state_after_alloc(request, blocks, num_external_tokens)
+        chosen_connector = self._requests_to_connector.get(
+            request.request_id, -1)
+        empty_blocks = blocks.new_empty()
+        for i, c in enumerate(self._connectors):
+            if i == chosen_connector:
+                # Forward call to the chosen connector (if any).
+                c.update_state_after_alloc(request, blocks,
+                                           num_external_tokens)
+            else:
+                # Call with empty blocks for other connectors.
+                c.update_state_after_alloc(request, empty_blocks, 0)
 
     def build_connector_meta(
             self,
@@ -169,7 +177,7 @@ class MultiConnector(KVConnectorBase_V1):
     def request_finished(
         self,
         request: "Request",
-        blocks: "KVCacheBlocks",
+        blocks: list[int],
     ) -> tuple[bool, Optional[dict[str, Any]]]:
         async_saves = 0
         kv_txfer_params = None
@@ -186,4 +194,8 @@ class MultiConnector(KVConnectorBase_V1):
                 kv_txfer_params = txfer_params
         if async_saves > 1:
             self._extra_async_saves[request.request_id] = async_saves - 1
+
+        # Clean up other state for this request.
+        self._requests_to_connector.pop(request.request_id, None)
+
         return async_saves > 0, kv_txfer_params
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 6303d77ad30559c071af314fbc19e329d24477ab..7552fc889f2f1a97406fa91ac5cc7178ba9eadb0 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
 import math
 import threading
@@ -14,6 +15,7 @@ import torch
 import zmq
 
 from vllm import envs
+from vllm.attention.selector import backend_name_to_enum, get_attn_backend
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
@@ -21,6 +23,7 @@ from vllm.distributed.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size,
     get_tp_group)
 from vllm.logger import init_logger
+from vllm.platforms import _Backend
 from vllm.utils import make_zmq_path, make_zmq_socket, round_down
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import RequestStatus
@@ -31,6 +34,7 @@ if TYPE_CHECKING:
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
     from vllm.v1.request import Request
 
+Transfer = tuple[int, float]  # (xfer_handle, start_time)
 GET_META_MSG = b"get_meta_msg"
 
 logger = init_logger(__name__)
@@ -53,6 +57,9 @@ class NixlAgentMetadata(
     agent_metadata: bytes
     kv_caches_base_addr: list[int]
     num_blocks: int
+    tp_size: int
+    block_len: int
+    attn_backend_name: str
 
 
 @dataclass
@@ -172,6 +179,11 @@ class NixlConnectorScheduler:
         self.vllm_config = vllm_config
         self.block_size = vllm_config.cache_config.block_size
         self.engine_id = engine_id
+        self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
+        self.side_channel_port = (
+            envs.VLLM_NIXL_SIDE_CHANNEL_PORT +
+            vllm_config.parallel_config.data_parallel_rank_local *
+            vllm_config.parallel_config.tensor_parallel_size)
         logger.info("Initializing NIXL Scheduler %s", engine_id)
 
         # Requests that need to start recv.
@@ -212,15 +224,6 @@ class NixlConnectorScheduler:
             if count > 0:
                 return count, True
 
-            # NOTE: if count is 0 here, we have less than block_size
-            # tokens to pull after subtracting the local prefix cache hit.
-            # The remote only sends fully computed blocks, so there is
-            # nothing to transfer but we still need to notify the
-            # prefill worker so that the remote blocks are freed.
-            if all(p in params for p in ("remote_engine_id", "remote_host",
-                                         "remote_port")):
-                self._reqs_need_recv[request.request_id] = (request, [])
-
         # No remote prefill for this request.
         return 0, False
 
@@ -238,9 +241,14 @@ class NixlConnectorScheduler:
             if params.get("remote_block_ids"):
                 if all(p in params for p in ("remote_engine_id", "remote_host",
                                              "remote_port")):
+                    # If remote_blocks and num_external_tokens = 0, we have
+                    # a full prefix cache hit on the D worker. We need to call
+                    # send_notif in _read_blocks to free the memory on the P.
+                    local_block_ids = (blocks.get_unhashed_block_ids()
+                                       if num_external_tokens > 0 else [])
                     # Get unhashed blocks to pull from remote.
                     self._reqs_need_recv[request.request_id] = (
-                        request, blocks.get_unhashed_block_ids())
+                        request, local_block_ids)
                 else:
                     logger.warning(
                         "Got invalid KVTransferParams: %s. This "
@@ -259,15 +267,6 @@ class NixlConnectorScheduler:
         # Loop through scheduled reqs and convert to ReqMeta.
         for req_id, (req, block_ids) in self._reqs_need_recv.items():
             assert req.kv_transfer_params is not None
-            # For the case where there are no remote blocks to pull
-            # (block_ids is empty), we don't need to schedule
-            # an async read on the worker side.
-            if not block_ids:
-                logger.debug(
-                    "Skipping adding request %s to NixlConnectorMetadata, "
-                    "as there are no remote blocks to pull", req_id)
-                continue
-
             meta.add_new_req(
                 request_id=req_id,
                 local_block_ids=block_ids,
@@ -310,8 +309,8 @@ class NixlConnectorScheduler:
             do_remote_decode=False,
             remote_block_ids=computed_block_ids,
             remote_engine_id=self.engine_id,
-            remote_host=envs.VLLM_NIXL_SIDE_CHANNEL_HOST,
-            remote_port=envs.VLLM_NIXL_SIDE_CHANNEL_PORT,
+            remote_host=self.side_channel_host,
+            remote_port=self.side_channel_port,
         )
 
 
@@ -325,21 +324,35 @@ class NixlConnectorWorker:
         logger.info("Initializing NIXL wrapper")
         logger.info("Initializing NIXL worker %s", engine_id)
 
+        # Config.
+        self.vllm_config = vllm_config
+        self.block_size = vllm_config.cache_config.block_size
+
         # Agent.
         self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), None)
-        # Map of engine_id -> agent_name.
-        self._remote_agents: dict[str, str] = {}
+        # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}.
+        self._remote_agents: dict[str, dict[int, str]] = defaultdict(dict)
+
+        # NIXL handshake port.
+        # NOTE(rob): Within a DP group, each DP rank gets its own
+        # base port (which is sent in the KVTransferParams).
+        # Each TP rank listens/queries on the base_port + tp_rank.
+        self.side_channel_port = (
+            envs.VLLM_NIXL_SIDE_CHANNEL_PORT +
+            vllm_config.parallel_config.data_parallel_rank_local *
+            vllm_config.parallel_config.tensor_parallel_size)
 
         # Metadata.
         self.engine_id = engine_id
-        self.rank = get_tensor_model_parallel_rank()
+        self.tp_rank = get_tensor_model_parallel_rank()
         self.world_size = get_tensor_model_parallel_world_size()
         self.tp_group = get_tp_group()
 
         # KV Caches and nixl tracking data.
         self.kv_caches: dict[str, torch.Tensor] = {}
 
-        # Map of engine_id -> kv_caches_base_addr
+        # Map of engine_id -> kv_caches_base_addr. For TP case, each local
+        # rank will still only pull from a single remote TP worker.
         self.kv_caches_base_addr: dict[str, list[int]] = {}
 
         # Number of NIXL regions. Currently one region per cache
@@ -347,19 +360,19 @@ class NixlConnectorWorker:
         self.num_regions = 0
         self.num_layers = 0
 
-        # nixl_prepped_dlist_handle (int).
+        # nixl_prepped_dlist_handle.
         self.src_xfer_side_handle: int = 0
         # Map of engine_id -> nixl_prepped_dlist_handle (int)].
         self.dst_xfer_side_handles: dict[str, int] = {}
 
-        # Map of engine_id -> num_blocks.
+        # Map of engine_id -> num_blocks. All ranks in the same deployment will
+        # have the same number of blocks.
         self.dst_num_blocks: dict[str, int] = {}
         self._registered_descs: list[Any] = []
 
         # In progress transfers.
         # [req_id -> list[handle]]
-        self._recving_transfers: defaultdict[str, list[Any]] = defaultdict(
-            list[Any])
+        self._recving_transfers = defaultdict[str, list[Transfer]](list)
 
         # Complete transfer tracker. Used by the rank 0 to track finished
         # transactions on ranks 1 to N-1.
@@ -374,23 +387,38 @@ class NixlConnectorWorker:
 
         self.vllm_config = vllm_config
         self.block_size = vllm_config.cache_config.block_size
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
 
         # TODO(mgoin): remove this once we have hybrid memory allocator
         # Optimization for models with local attention (Llama 4)
         # List of block window sizes for each layer for local attention
         self.block_window_per_layer: list[Optional[int]] = []
+        self.use_mla = self.model_config.use_mla
+
+        backend = get_attn_backend(self.model_config.get_head_size(),
+                                   self.model_config.dtype,
+                                   self.cache_config.cache_dtype,
+                                   self.block_size,
+                                   self.model_config.is_attention_free,
+                                   use_mla=self.use_mla)
+        self.backend_name = backend.get_name()
+        attn_backend = backend_name_to_enum(self.backend_name)
+        self._use_flashinfer = attn_backend == _Backend.FLASHINFER_VLLM_V1
+        logger.debug("Detected attention backend %s", self.backend_name)
+
+        self._tp_size: dict[str, int] = {self.engine_id: self.world_size}
+        # With heterogeneous TP, P must wait for all assigned D TP workers to
+        # finish reading before safely freeing the blocks.
+        self.consumer_notification_counts_by_req = defaultdict[str, int](int)
 
     @staticmethod
     def _nixl_handshake_listener(metadata: NixlAgentMetadata,
-                                 ready_event: threading.Event, rank: int):
+                                 ready_event: threading.Event, base_port: int,
+                                 tp_rank: int):
         """Background thread for getting new NIXL handshakes."""
         # NOTE(rob): this is a simple implementation. We will move
-        # to a better approach like an ETCD server in the future.
-
-        # NOTE(rob): to support heterogeneous TP, we will have to
-        # move this into the scheduler rather than worker, since
-        # each rank needs the metadata of all other ranks (whereas
-        # in this setup, each rank only gets one other rank's meta.
+        # to a better approach via HTTP endpoint soon.
 
         encoder = msgspec.msgpack.Encoder()
         encoded_data = encoder.encode(metadata)
@@ -400,11 +428,7 @@ class NixlConnectorWorker:
 
         # Listen for new requests for metadata.
         host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
-        # NOTE(rob): we need each rank to have a unique port. This
-        # hack to keeps us moving. We will switch when moving to etcd
-        # or where we have a single ZMQ socket in the scheduler.
-        port = envs.VLLM_NIXL_SIDE_CHANNEL_PORT + rank
-        path = make_zmq_path("tcp", host, port)
+        path = make_zmq_path("tcp", host, base_port + tp_rank)
         logger.debug("Starting listening on path: %s", path)
         with zmq_ctx(zmq.ROUTER, path) as sock:
             ready_event.set()
@@ -419,27 +443,44 @@ class NixlConnectorWorker:
         """Do a NIXL handshake with a remote instance."""
 
         start_time = time.perf_counter()
+
         # NOTE(rob): we need each rank to have a unique port. This is
         # a hack to keep us moving. We will switch when moving to etcd
         # or where we have a single ZMQ socket in the scheduler.
-        path = make_zmq_path("tcp", host, port + self.rank)
-        logger.debug("Querying metadata on path: %s", path)
-        with zmq_ctx(zmq.REQ, path) as sock:
-            # Send query for the request.
-            sock.send(GET_META_MSG)
-            metadata_bytes = sock.recv()
-            decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
-            metadata = decoder.decode(metadata_bytes)
-            got_metadata_time = time.perf_counter()
 
-            # Register Remote agent.
-            self.add_remote_agent(metadata)
-            setup_agent_time = time.perf_counter()
-
-            logger.debug("NIXL handshake: get metadata took: %s",
-                         got_metadata_time - start_time)
-            logger.debug("NIXL handshake: add agent took: %s",
-                         setup_agent_time - got_metadata_time)
+        def handshake(path: str, rank: int) -> NixlAgentMetadata:
+            # Send query for the request.
+            with zmq_ctx(zmq.REQ, path) as sock:
+                sock.send(GET_META_MSG)
+                metadata_bytes = sock.recv()
+                decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
+                metadata = decoder.decode(metadata_bytes)
+                got_metadata_time = time.perf_counter()
+
+                # Register Remote agent.
+                self.add_remote_agent(metadata, rank)
+                setup_agent_time = time.perf_counter()
+
+                logger.debug("NIXL handshake: get metadata took: %s",
+                             got_metadata_time - start_time)
+                logger.debug("NIXL handshake: add agent took: %s",
+                             setup_agent_time - got_metadata_time)
+                return metadata
+
+        # Handshake with remote agent-rank0 first to get the tp_size of remote
+        path = make_zmq_path("tcp", host, port)
+        logger.debug("Querying master rank metadata on path: %s", path)
+        metadata = handshake(path, 0)
+
+        # Handshake only with the other TP remote the current local rank will
+        # pull from. With homogeneous TP it happens to be the same rank_i.
+        tp_ratio = self._tp_size[self.engine_id] // metadata.tp_size
+        p_remote_rank = self.tp_rank // tp_ratio
+        if p_remote_rank > 0:
+            path = make_zmq_path("tcp", host, port + p_remote_rank)
+            logger.debug("Querying metadata on path: %s at remote rank %s",
+                         path, p_remote_rank)
+            _ = handshake(path, p_remote_rank)
 
     def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
         """Register the KV Cache data in nixl."""
@@ -448,27 +489,44 @@ class NixlConnectorWorker:
         kv_elem_size = first_kv_cache.element_size()
 
         # TODO(tms): Find a more robust way to detect and handle MLA
+        # NOTE (NickLucche) To move blocks efficiently with NIXL, the expected
+        # KV memory layout is HND, as opposed to the default NHD. Note that it
+        # will only affects the strides. For MLA instead, we make require no
+        # such thing and resort to the standard layout.
         use_mla = len(first_kv_cache.shape) == 3
+        assert use_mla == self.use_mla
+
+        # TODO (NickLucche) not compatible with hybrid allocator. Enforce check
+        # once it goes live, as a single kv layout is expected for xfers.
         if use_mla:
             # MLA case.
             self.num_blocks = first_kv_cache.shape[0]
             block_rank = 2  # [block_size, latent_dim]
             block_shape = first_kv_cache.shape[-block_rank:]
+            block_size, kv_latent_dim = block_shape
+            self.slot_size_bytes = kv_elem_size * kv_latent_dim
         else:
             # [2 (k and v), num_blocks, ...]
-            self.num_blocks = first_kv_cache.shape[1]
-            block_rank = 3  # [block_size, kv_heads, head_dim]
+            if self._use_flashinfer:
+                # FlashInfer swaps 2<->num_blocks dimensions.
+                self.num_blocks = first_kv_cache.shape[0]
+                block_rank = 4  # [2, block_size, kv_heads, head_dim]
+            else:
+                self.num_blocks = first_kv_cache.shape[1]
+                block_rank = 3  # [block_size, kv_heads, head_dim]
             block_shape = first_kv_cache.shape[-block_rank:]
-
+            block_size, n_kv_heads, head_dim = block_shape[-3:]
+            # head size in bytes.
+            self.slot_size_bytes = kv_elem_size * n_kv_heads * head_dim
+        assert block_size == self.block_size
         # TODO(tms): self.block_len needs to be per-layer for sliding window,
         # hybrid attn, etc
+        # block size in bytes
         self.block_len = kv_elem_size * math.prod(block_shape)
-
-        logger.debug("Registering KV_Caches. use_mla: %s, shape %s", use_mla,
-                     first_kv_cache.shape)
-        logger.debug("num_blocks: %s, block_shape: %s", self.num_blocks,
-                     block_shape)
-        logger.debug("Per layer kv cache size: %s", first_kv_cache.shape)
+        logger.info(
+            "Registering KV_Caches: use_mla: %s, num_blocks: %s, "
+            "block_shape: %s, per_layer_kv_cache_shape: %s", use_mla,
+            self.num_blocks, block_shape, first_kv_cache.shape)
         self.dst_num_blocks[self.engine_id] = self.num_blocks
         self.kv_caches = kv_caches
         kv_caches_base_addr = []
@@ -480,13 +538,17 @@ class NixlConnectorWorker:
         # are non-contiguous (it's not locally guaranteed that they will be)
         # Disadvantage is that the encoded NixlAgentMetadata is now larger
         # (roughly 8KB vs 5KB).
+        # Conversely for FlashInfer, K and V are transferred in the same tensor
+        # to better exploit the memory layout (ie num_blocks is the first dim).
         for cache_or_caches in kv_caches.values():
             # Normalize to always be a list of caches
-            cache_list = [cache_or_caches] if use_mla else cache_or_caches
+            cache_list = [cache_or_caches] if use_mla or self._use_flashinfer \
+                else cache_or_caches
             for cache in cache_list:
                 base_addr = cache.data_ptr()
                 region_len = self.num_blocks * self.block_len
-                caches_data.append((base_addr, region_len, self.rank, ""))
+                caches_data.append(
+                    (base_addr, region_len, cache.device.index, ""))
                 kv_caches_base_addr.append(base_addr)
         self.kv_caches_base_addr[self.engine_id] = kv_caches_base_addr
         self.num_regions = len(caches_data)
@@ -516,69 +578,171 @@ class NixlConnectorWorker:
         logger.debug("Registering descs: %s", caches_data)
         self.nixl_wrapper.register_memory(descs)
         logger.debug("Done registering descs")
-
         self._registered_descs.append(descs)
 
+        # Register local/src descr for NIXL xfer.
+        blocks_data = []
+        for base_addr in self.kv_caches_base_addr[self.engine_id]:
+            # NOTE With heter-TP, more blocks are prepared than what are
+            # needed as self.num_blocks >= nixl_agent_meta.num_blocks. We
+            # could create fewer, but then _get_block_descs_ids needs to
+            # select agent_meta.num_blocks instead of self.num_blocks for
+            # local descr, and that makes handling regular flow less clean.
+            for block_id in range(self.num_blocks):
+                block_offset = block_id * self.block_len
+                addr = base_addr + block_offset
+                # (addr, len, device id)
+                blocks_data.append((addr, self.block_len, self.tp_rank))
+        logger.debug("Created %s blocks for src engine %s and rank %s",
+                     len(blocks_data), self.engine_id, self.tp_rank)
+
+        descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
+        # NIXL_INIT_AGENT to be used for preparations of local descs.
+        self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist(
+            "NIXL_INIT_AGENT", descs)
+
         # After KV Caches registered, listen for new connections.
         metadata = NixlAgentMetadata(
             engine_id=self.engine_id,
             agent_metadata=self.nixl_wrapper.get_agent_metadata(),
             kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id],
             num_blocks=self.num_blocks,
-        )
+            tp_size=self.world_size,
+            block_len=self.block_len,
+            attn_backend_name=self.backend_name)
         ready_event = threading.Event()
         self._nixl_handshake_listener_t = threading.Thread(
             target=self._nixl_handshake_listener,
-            args=(metadata, ready_event, self.rank),
+            args=(metadata, ready_event, self.side_channel_port, self.tp_rank),
             daemon=True,
             name="nixl_handshake_listener")
         self._nixl_handshake_listener_t.start()
         ready_event.wait()
 
-    def add_remote_agent(self, nixl_agent_meta: NixlAgentMetadata):
+    def add_remote_agent(self,
+                         nixl_agent_meta: NixlAgentMetadata,
+                         remote_tp_rank: int = 0):
+        """
+        Add the remote NIXL agent and prepare the descriptors for reading cache
+        blocks from remote.
+
+        In particular, handle both homogeneous and heterogeneous TP. The former
+        requires local rank_i to read from remote rank_i. 
+        The latter, assuming D.world_size > P.world_size, requires that two or 
+        more local TP worker share the xfer from a single TP worker.
+
+        Here's an example:
+
+        rank_offset     p_remote_tp_rank
+        (kv split no)    
+        --------------------------------
+            0                 0      Worker0  ---- 1st half of KV ----> Worker0  [ KV Cache ]
+                                                                        /
+            1                 0      Worker1  ---- 2nd half of KV -----/
+
+            0                 1      Worker2  ---- 1st half of KV ----> Worker1  [ KV Cache ]
+                                                                        /
+            1                 1      Worker3  ---- 2nd half of KV -----/
+
+
+                                Decoder TP workers                     Prefix TP workers
+                                  (world_size=4)                         (world_size=2)
+                                                 tp_ratio = 4 // 2 = 2                  
+                                
+        Considering the KV Caches, if P-Worker_i has cache size [2, num_blocksP, kv_heads, block_size, head_dim]  
+        then D-Worker_j has [2, num_blocksD, kv_heads//tp_ratio, block_size, head_dim]. Mind the "HND" layout format.
+        Assuming num_blocksD >= num_blocksP, D-Worker0 reads from P-Worker0 by preparing the kv_heads//tp_ratio 
+        first heads from all the slots of all the blocks. D-Worker1 will do the same, but reading the second split
+        along the kv_heads dimension, and so forth until "tp_ratio" D TP workers have pulled from P-Worker0.   
+        
+        Note that the above will also hold true for the homogeneous TP case, where tp_ratio evaluates to 1.
+
+        Regarding MLA case, the cache is replicated across TP workers so the rank_offset will just always be 0
+        so that the whole cache is shared by "tp_ratio" D TP workers.
+        """ # noqa: E501
         engine_id = nixl_agent_meta.engine_id
-        assert engine_id != self.engine_id, "Conflict engine id found!"
-        if engine_id in self._remote_agents:
+        # TODO re-evaluate refreshing for scaling/recovery
+        if remote_tp_rank in self._remote_agents.get(engine_id, ()):
             return
 
-        self._remote_agents[engine_id] = self.nixl_wrapper.add_remote_agent(
-            nixl_agent_meta.agent_metadata)
-        self.kv_caches_base_addr[
-            engine_id] = nixl_agent_meta.kv_caches_base_addr
+        if engine_id in self._tp_size:
+            assert self._tp_size[engine_id] == nixl_agent_meta.tp_size
+        else:
+            self._tp_size[engine_id] = nixl_agent_meta.tp_size
+        # We may eventually enable this after asserting equality in cache
+        # layout and close outputs.
+        assert nixl_agent_meta.attn_backend_name == self.backend_name
+
+        self._remote_agents[engine_id][
+            remote_tp_rank] = self.nixl_wrapper.add_remote_agent(
+                nixl_agent_meta.agent_metadata)
+
+        # Number of D TP workers reading from a single P TP worker. This is
+        # 1 when P and D `--tensor-parallel-size` match.
+        assert self._tp_size[self.engine_id] % self._tp_size[engine_id] == 0, (
+            "Local TP size must be divisible by remote TP size.")
+        tp_ratio = self._tp_size[self.engine_id] // self._tp_size[engine_id]
+        assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP"
+        if self.use_mla:
+            # With MLA the only difference is in the number of blocks.
+            remote_block_size = nixl_agent_meta.block_len // (
+                self.slot_size_bytes)
+            assert self.block_len == nixl_agent_meta.block_len
+        else:
+            remote_block_size = nixl_agent_meta.block_len // (
+                self.slot_size_bytes * tp_ratio)
+            if self._use_flashinfer:
+                # Account for joint KV in FlashInfer.
+                remote_block_size //= 2
+
+            assert nixl_agent_meta.block_len == self.block_len * tp_ratio, (
+                "Remote P worker KV layer cache must be of shape [2, N, "
+                "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype."
+            )
 
-        # Create src descs and xfer side handles.
-        blocks_data = []
-        for base_addr in self.kv_caches_base_addr[self.engine_id]:
-            for block_id in range(self.num_blocks):
-                block_offset = block_id * self.block_len
-                # (addr, len, device id)
-                blocks_data.append(
-                    (base_addr + block_offset, self.block_len, self.rank))
-        logger.debug("Created %s blocks for src engine %s and rank %s",
-                     len(blocks_data), self.engine_id, self.rank)
+        assert self.block_size == remote_block_size, "Remote P worker with " \
+        "different block size is not supported"
 
-        # Register with NIXL.
-        descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
-        self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist(
-            "NIXL_INIT_AGENT", descs)
+        assert self.num_blocks >= nixl_agent_meta.num_blocks
+
+        # Create dst descs and xfer side handles. TP workers have same #blocks.
+        if engine_id in self.dst_num_blocks:
+            assert self.dst_num_blocks[engine_id] == nixl_agent_meta.num_blocks
+        else:
+            self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks
 
-        # Create dst descs and xfer side handles.
-        self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks
         blocks_data = []
-        for base_addr in self.kv_caches_base_addr[engine_id]:
-            for block_id in range(nixl_agent_meta.num_blocks):
-                block_offset = block_id * self.block_len
-                # (addr, len, device id)
-                blocks_data.append(
-                    (base_addr + block_offset, self.block_len, self.rank))
-        logger.debug("Created %s blocks for dst engine %s and rank %s",
-                     len(blocks_data), engine_id, self.rank)
+        # With homogeneous TP, D pulls the whole kv cache from corresponding
+        # rank. With heterogeneous TP, prepare the descriptors by splitting the
+        # P KV cache along kv_head dim, of D worker's kv_head size (D>P).
+        # Eg. PTP1 DTP2 => P0 KV:[block0-KV_0 | block0-KV_1..].
+        p_remote_tp_rank = self.tp_rank // tp_ratio
+        # Only register the remote's descriptors if current rank pulls from it.
+        if p_remote_tp_rank == remote_tp_rank:
+            self.kv_caches_base_addr[
+                engine_id] = nixl_agent_meta.kv_caches_base_addr
+            rank_offset = self.tp_rank % tp_ratio * self.block_len \
+                if not self.use_mla else 0
+            # Register all remote blocks, but only the corresponding kv heads.
+            for base_addr in nixl_agent_meta.kv_caches_base_addr:
+                for block_id in range(nixl_agent_meta.num_blocks):
+                    block_offset = block_id * nixl_agent_meta.block_len
+                    # For each block, grab the heads chunk belonging to rank_i
+                    # of size remote_nheads // tp_ratio, which correspond to
+                    # self.block_len == remote_block_len//tp_ratio bytes.
+                    addr = base_addr + block_offset + rank_offset
+                    # (addr, len, device id)
+                    blocks_data.append((addr, self.block_len, remote_tp_rank))
+            logger.debug(
+                "Created %s blocks for dst engine %s with remote rank %s and "
+                "local rank %s", len(blocks_data), engine_id, remote_tp_rank,
+                self.tp_rank)
 
-        # Register with NIXL.
-        descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
-        self.dst_xfer_side_handles[
-            engine_id] = self.nixl_wrapper.prep_xfer_dlist(
-                self._remote_agents[engine_id], descs)
+            # Register with NIXL.
+            descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
+            self.dst_xfer_side_handles[
+                engine_id] = self.nixl_wrapper.prep_xfer_dlist(
+                    self._remote_agents[engine_id][remote_tp_rank], descs)
 
     def get_finished(self) -> tuple[set[str], set[str]]:
         """
@@ -597,14 +761,14 @@ class NixlConnectorWorker:
         if len(done_sending) > 0 or len(done_recving) > 0:
             logger.debug(
                 "Rank %s, get_finished: %s requests done sending "
-                "and %s requests done recving", self.rank, len(done_sending),
-                len(done_recving))
+                "and %s requests done recving", self.tp_rank,
+                len(done_sending), len(done_recving))
 
         if self.world_size == 1:
             return done_sending, done_recving
 
         # Rank 0: get finished from all other ranks.
-        if self.rank == 0:
+        if self.tp_rank == 0:
             for req_id in done_sending:
                 self._done_sending_count[req_id] += 1
             for req_id in done_recving:
@@ -646,16 +810,25 @@ class NixlConnectorWorker:
             return done_sending, done_recving
 
     def _get_new_notifs(self) -> set[str]:
-        """Get req_ids which got a remote xfer message."""
-
+        """
+        Get req_ids which got a remote xfer message. When multiple consumers
+        are reading from the same producer (heterogeneous TP scenario), wait
+        for all consumers to be done pulling.
+        """
         notified_req_ids: set[str] = set()
-        for req_ids in self.nixl_wrapper.get_new_notifs().values():
-            for req_id in req_ids:
-                assert req_id not in notified_req_ids
-                notified_req_ids.add(req_id.decode("utf-8"))
+        for notifs in self.nixl_wrapper.get_new_notifs().values():
+            for notif in notifs:
+                req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1)
+                self.consumer_notification_counts_by_req[req_id] += 1
+                # Wait all consumers (D) to be done reading before freeing.
+                if self.consumer_notification_counts_by_req[req_id] == int(
+                        tp_ratio):
+                    notified_req_ids.add(req_id)
+                    del self.consumer_notification_counts_by_req[req_id]
         return notified_req_ids
 
-    def _pop_done_transfers(self, transfers: dict[str, list[int]]) -> set[str]:
+    def _pop_done_transfers(
+            self, transfers: dict[str, list[tuple[int, float]]]) -> set[str]:
         """
         Pop completed xfers by checking for DONE state.
         Args:
@@ -665,23 +838,17 @@ class NixlConnectorWorker:
         """
         done_req_ids: set[str] = set()
         for req_id, handles in list(transfers.items()):
-            running_reqs = []
-            for handle in handles:
+            for handle, xfer_stime in handles:
                 xfer_state = self.nixl_wrapper.check_xfer_state(handle)
                 if xfer_state == "DONE":
-                    # TODO ptarasiewicz: why abort is throwing errors?
-                    # self.nixl_wrapper.release_xfer_handle(handle)
+                    self.nixl_wrapper.release_xfer_handle(handle)
+                    done_req_ids.add(req_id)
+                    del transfers[req_id]
+                elif xfer_state == "PROC":
                     continue
-                if xfer_state == "PROC":
-                    running_reqs.append(handle)
                 else:
                     raise RuntimeError("Transfer failed with state %s",
                                        xfer_state)
-            if len(running_reqs) == 0:
-                done_req_ids.add(req_id)
-                del transfers[req_id]
-            else:
-                transfers[req_id] = running_reqs
         return done_req_ids
 
     def start_load_kv(self, metadata: NixlConnectorMetadata):
@@ -727,12 +894,19 @@ class NixlConnectorWorker:
         # saturate IB with heterogeneous TP sizes. We should remove the staging
         # blocks until we are ready.
 
+        # Number of D TP workers that will read from dst P. Propagate tp_ratio
+        # on notification so that dst worker can wait before freeing blocks.
+        tp_ratio = self._tp_size[
+            self.engine_id] // self._tp_size[dst_engine_id]
+        notif_id = f"{request_id}:{tp_ratio}".encode()
+
         # Full prefix cache hit: do not need to read remote blocks,
         # just notify P worker that we have the blocks we need.
         num_local_blocks = len(local_block_ids)
         if num_local_blocks == 0:
-            self.nixl_wrapper.send_notif(dst_engine_id,
-                                         notif_msg=request_id.encode("utf-8"))
+            remote_rank = self.tp_rank // tp_ratio
+            agent_name = self._remote_agents[dst_engine_id][remote_rank]
+            self.nixl_wrapper.send_notif(agent_name, notif_msg=notif_id)
             return
 
         # Partial prefix cache hit: just read uncomputed blocks.
@@ -745,6 +919,10 @@ class NixlConnectorWorker:
         local_xfer_side_handle = self.src_xfer_side_handle
         remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id]
 
+        # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from
+        # corresponding rank. With heterogeneous TP, fixing D>P, the D tp
+        # workers will issue xfers to parts of the P worker remote kv caches.
+
         # Get descs ids.
         local_block_descs_ids: list[int] = []
         remote_block_descs_ids: list[int] = []
@@ -788,14 +966,16 @@ class NixlConnectorWorker:
             local_block_descs_ids,
             remote_xfer_side_handle,
             remote_block_descs_ids,
-            notif_msg=request_id.encode("utf-8"),
+            notif_msg=notif_id,
         )
 
         # Begin async xfer.
         self.nixl_wrapper.transfer(handle)
 
         # Use handle to check completion in future step().
-        self._recving_transfers[request_id].append(handle)
+        # TODO (NickLucche) surface xfer elapsed time
+        self._recving_transfers[request_id].append(
+            (handle, time.perf_counter()))
 
     def _get_block_descs_ids(self,
                              engine_id: str,
@@ -806,7 +986,6 @@ class NixlConnectorWorker:
         If layer_idx is provided, we use the region_ids for the given layer.
         Otherwise, we use all regions.
         """
-
         if layer_idx is None:
             region_ids = range(self.num_regions)
         else:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
index 0421a65a2c819e4fc9dfbd7549e757ae21b416a5..f86b92692a0e56920a1e97c4caa649250ae6c835 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import hashlib
 import os
 from dataclasses import dataclass
diff --git a/vllm/distributed/kv_transfer/kv_connector_agent.py b/vllm/distributed/kv_transfer/kv_connector_agent.py
index 819c06805ee47ee22ef80309095771e9954437c7..8633fdaf59f8b48a028dda8d92251af1f4ecc025 100644
--- a/vllm/distributed/kv_transfer/kv_connector_agent.py
+++ b/vllm/distributed/kv_transfer/kv_connector_agent.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A centralized entrypoint to perform distributed KV cache transfer.
 
 This implementation is a shim wrapper on two APIs exposed by `kv_connector`:
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
index d1ffb8092dfc984301f274ca26d50df56629cce5..eef14269f1961b94d4ec8532c3a928c2657a539c 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file contains a new class `KVLookupBufferBase` that allows developers to
 think of KV cache operations as inserting new KV cache entries (`insert`)
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
index 5bb7110216768dc0e55eac6e92bedb718f5f1707..4381aad1e9956823f1f1f4a03a9a4784cd55adcb 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file contains a new class `MooncakeStore` that allows developers to
 think of KV cache transfer operations as putting new KV cache entries
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
index e3b2274bd8a41855c5db779a8511a1578ae72432..a0ff7c320f61e6a07649ff0885323ed16d8a85dc 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
     Implements a distributed key-value (KV) cache transfer mechanism.
 
diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py
index 40589fb3ef872e97dd0d2577965824548f180323..1423fd032477eae7255806546c79d1e080ec4cef 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/base.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file defines an interface `KVPipeBase`
 that provides an abstraction for sending and receiving tensors, or None, via
diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
index aa4b1ba71492cf49351fe22594d97a7160ae6db7..9f3494b8106e2d74bf671fcadcf0d43c3c308a68 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import os
diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
index 761c56f7e41f5be218367c21ccafacb0450c7949..09de0b682efca795d0b8ec874c13a5200c3ad043 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
     This module implements a PyNccl pipe for sending and receiving
     Optional[torch.Tensor] between distributed ranks with advanced
diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py
index 25d2f2cf5c6e6413a87eee48766a8b5dc4801852..60f1d5d8bca758822db1a6e1860db4fdd4dfe014 100644
--- a/vllm/distributed/kv_transfer/kv_transfer_state.py
+++ b/vllm/distributed/kv_transfer/kv_transfer_state.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import TYPE_CHECKING, Optional
 
 from vllm import envs
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index b674d05a7771bbb12a59766c0c4e68b39fb83b91..10f87c49baa9e80c7c54736c9372a0e036ba8c74 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2023 The vLLM team.
 # Adapted from
@@ -41,8 +42,8 @@ from vllm.distributed.device_communicators.base_device_communicator import (
     DeviceCommunicatorBase)
 from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
-from vllm.utils import (direct_register_custom_op, resolve_obj_by_qualname,
-                        supports_custom_op)
+from vllm.utils import (direct_register_custom_op, get_distributed_init_method,
+                        resolve_obj_by_qualname, supports_custom_op)
 
 
 @dataclass
@@ -929,7 +930,7 @@ def init_distributed_environment(
         world_size = parallel_config.world_size_across_dp
         ip = parallel_config.data_parallel_master_ip
         port = parallel_config.get_next_dp_init_port()
-        distributed_init_method = f"tcp://{ip}:{port}"  # noqa
+        distributed_init_method = get_distributed_init_method(ip, port)
         logger.info(
             "Adjusting world_size=%d rank=%d distributed_init_method=%s for DP",
             world_size, rank, distributed_init_method)
@@ -1203,7 +1204,8 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
     if empty_cache is not None:
         empty_cache()
     try:
-        torch._C._host_emptyCache()
+        if not current_platform.is_cpu():
+            torch._C._host_emptyCache()
     except AttributeError:
         logger.warning(
             "torch._C._host_emptyCache() only available in Pytorch >=2.5")
diff --git a/vllm/distributed/tpu_distributed_utils.py b/vllm/distributed/tpu_distributed_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..36ab2eb3a62f63d9189d877591bc94077cf6bc60
--- /dev/null
+++ b/vllm/distributed/tpu_distributed_utils.py
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: Apache-2.0
+from collections import OrderedDict
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch_xla.distributed.spmd as xs
+from torch.nn.parameter import Parameter
+
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+
+logger = init_logger(__name__)
+
+
+class XlaQKVParallelLinear(nn.Module):
+
+    def __init__(self,
+                 qkv_linear: nn.Module,
+                 mesh: Optional["xs.Mesh"] = None):
+        super().__init__()
+        assert isinstance(qkv_linear, QKVParallelLinear)
+        self.skip_bias_add = qkv_linear.skip_bias_add
+        self.return_bias = qkv_linear.return_bias
+        assert qkv_linear.tp_size == 1, "TP > 1 is only supported under SPMD."
+
+        self.q_weight: Parameter
+        self.k_weight: Parameter
+        self.v_weight: Parameter
+        self.q_bias: Optional[Parameter]
+        self.k_bias: Optional[Parameter]
+        self.v_bias: Optional[Parameter]
+        self._load_weights_from_qkv_linear(qkv_linear)
+        if mesh is not None:
+            self._shard_weight(mesh)
+
+    def _shard_weight(self, mesh: "xs.Mesh"):
+        self.q_weight = Parameter(self.q_weight.to('xla'), requires_grad=False)
+        self.k_weight = Parameter(self.k_weight.to('xla'), requires_grad=False)
+        self.v_weight = Parameter(self.v_weight.to('xla'), requires_grad=False)
+        xs.mark_sharding(self.q_weight, mesh, ('x', None))
+        xs.mark_sharding(self.k_weight, mesh, ('x', None))
+        xs.mark_sharding(self.v_weight, mesh, ('x', None))
+        if self.q_bias is not None:
+            assert self.k_bias is not None and self.v_bias is not None, \
+                "QKVParallelLinear should have q, k, and v biases together."
+            self.q_bias = Parameter(self.q_bias.to('xla'), requires_grad=False)
+            xs.mark_sharding(self.q_bias, mesh, ('x', ))
+            self.k_bias = Parameter(self.k_bias.to('xla'), requires_grad=False)
+            xs.mark_sharding(self.k_bias, mesh, ('x', ))
+            self.v_bias = Parameter(self.v_bias.to('xla'), requires_grad=False)
+            xs.mark_sharding(self.v_bias, mesh, ('x', ))
+
+    def _load_weights_from_qkv_linear(self, qkv_linear: nn.Module):
+        q_proj_size, k_proj_size, _ = qkv_linear.output_sizes
+        # The weight of qkv linear is a concatenation of q, k, and v weights
+        # along the output dimension.
+        qkv_weight = qkv_linear.weight.data.cpu()
+        q_weight = Parameter(qkv_weight[:q_proj_size], requires_grad=False)
+        k_weight = Parameter(qkv_weight[q_proj_size:q_proj_size + k_proj_size],
+                             requires_grad=False)
+        v_weight = Parameter(qkv_weight[q_proj_size + k_proj_size:],
+                             requires_grad=False)
+        self.register_parameter("q_weight", q_weight)
+        self.register_parameter("k_weight", k_weight)
+        self.register_parameter("v_weight", v_weight)
+
+        if qkv_linear.bias is not None:
+            q_bias = Parameter(qkv_linear.bias[:q_proj_size],
+                               requires_grad=False)
+            k_bias = Parameter(qkv_linear.bias[q_proj_size:q_proj_size +
+                                               k_proj_size],
+                               requires_grad=False)
+            v_bias = Parameter(qkv_linear.bias[q_proj_size + k_proj_size:],
+                               requires_grad=False)
+            self.register_parameter("q_bias", q_bias)
+            self.register_parameter("k_bias", k_bias)
+            self.register_parameter("v_bias", v_bias)
+        else:
+            self.register_parameter("q_bias", None)
+            self.register_parameter("k_bias", None)
+            self.register_parameter("v_bias", None)
+
+    def forward(self, input):
+        # Same forward functionality as QKVParallelLinear, but doing qkv porj
+        # separately.
+        q_bias = self.q_bias if not self.skip_bias_add else None
+        k_bias = self.k_bias if not self.skip_bias_add else None
+        v_bias = self.v_bias if not self.skip_bias_add else None
+        q_proj = F.linear(input, self.q_weight, q_bias)
+        k_proj = F.linear(input, self.k_weight, k_bias)
+        v_proj = F.linear(input, self.v_weight, v_bias)
+        # The q/k/v projections will be split outside of the QKVParallelLinear.
+        # Because we are replacing XlaQKVParallelLinear with the
+        # QKVParallelLinear, we need to concatenate q, k, and v projections to
+        # match the output shape of the QKVParallelLinear implementation even if
+        # it seems to be redundant.
+        # The concat and the following split will be noop, and should be
+        # optimized away by the compiler.
+        qkv_proj = torch.cat([q_proj, k_proj, v_proj], dim=-1)
+        output_bias = torch.cat([q_bias, k_bias, v_bias], dim=-1) if \
+                            self.skip_bias_add else None
+        if not self.return_bias:
+            return qkv_proj
+        return qkv_proj, output_bias
+
+
+def partition_column_parallel_linear(layer: torch.nn.Module,
+                                     mesh: xs.Mesh) -> torch.nn.Module:
+    assert isinstance(layer, ColumnParallelLinear)
+    xs.mark_sharding(layer.weight, mesh, ('x', None))
+    logger.debug("Applied column-parallel sharding to %s", layer)
+    return layer
+
+
+def partition_row_parallel_linear(layer: torch.nn.Module,
+                                  mesh: xs.Mesh) -> torch.nn.Module:
+    assert isinstance(layer, RowParallelLinear)
+    xs.mark_sharding(layer.weight, mesh, (None, 'x'))
+    logger.debug("Applied row-parallel sharding to %s", layer)
+    return layer
+
+
+def partition_qkv_parallel_linear(layer: torch.nn.Module,
+                                  mesh: xs.Mesh) -> torch.nn.Module:
+    assert isinstance(layer, QKVParallelLinear)
+    xla_layer = XlaQKVParallelLinear(layer, mesh)
+    logger.debug("Applied qkv parallel sharding to %s", layer)
+    return xla_layer
+
+
+MODULE_TYPE_TO_WRAPPING_FUNC = OrderedDict([
+    ("QKVParallelLinear", partition_qkv_parallel_linear),
+    ("ColumnParallelLinear", partition_column_parallel_linear),
+    ("RowParallelLinear", partition_row_parallel_linear),
+])
+
+
+def get_fqn(module):
+    # Get the fully qualified name of the module
+    return module.__class__.__qualname__
+
+
+def shard_model(model: torch.nn.Module, mesh: "xs.Mesh") -> None:
+    """
+    Recursively check a PyTorch model and apply appropriate sharding based on 
+    the MODULE_TYPE_TO_WRAPPING_FUNC mapping.
+    
+    Args:
+        model: torch.nn.Module to process
+        mesh: An XLA SPMD mesh object used for sharding
+    """
+
+    def _process_module(module, name=None, parent=None):
+        for module_type, wrapping_func in MODULE_TYPE_TO_WRAPPING_FUNC.items():
+            if get_fqn(module) == module_type:
+                wrapped_module = wrapping_func(module, mesh)
+
+                assert parent is not None and name is not None, (
+                    "Top Level module is not expected to be wrapped.")
+                if wrapped_module is not module:
+                    # Wrapped module and module are different py object.
+                    # The original module should be replaced by the
+                    # wrapped_module.
+                    logger.debug("replace %s with %s", module, wrapped_module)
+                    setattr(parent, name, wrapped_module)
+
+                module = wrapped_module
+                break
+
+        for child_name, child_module in list(module.named_children()):
+            _process_module(child_module, child_name, module)
+
+    _process_module(model)
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 93a069d36c4b136ec4f154e2adcb13ff10051568..67f71643d039c3f6b05e93172017f7c3baa77723 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2023 The vLLM team.
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 import dataclasses
-import datetime
 import os
 import pickle
 import socket
@@ -14,14 +14,14 @@ import time
 import uuid
 from collections import deque
 from collections.abc import Sequence
+from datetime import timedelta
 from typing import Any, Optional
 
 import torch
 from torch.distributed import ProcessGroup, TCPStore
 from torch.distributed.distributed_c10d import (Backend, PrefixStore,
                                                 _get_default_timeout,
-                                                _unregister_process_group,
-                                                is_nccl_available)
+                                                _unregister_process_group)
 from torch.distributed.rendezvous import rendezvous
 
 import vllm.envs as envs
@@ -406,7 +406,7 @@ class StatelessProcessGroup:
             port=port,
             world_size=world_size,
             is_master=launch_server,
-            timeout=datetime.timedelta(seconds=store_timeout),
+            timeout=timedelta(seconds=store_timeout),
             use_libuv=False,  # for now: github.com/pytorch/pytorch/pull/150215
             master_listen_fd=listen_fd,
         )
@@ -419,6 +419,43 @@ class StatelessProcessGroup:
             data_expiration_seconds=data_expiration_seconds)
 
 
+def init_gloo_process_group(backend: Backend, prefix_store: PrefixStore,
+                            group_rank: int, group_size: int,
+                            timeout: timedelta) -> ProcessGroup:
+    """
+    Stateless init ProcessGroup with gloo backend compatible with 
+    different torch versions.
+    """
+    if is_torch_equal_or_newer("2.6"):
+        pg = ProcessGroup(
+            prefix_store,
+            group_rank,
+            group_size,
+        )
+    else:
+        options = ProcessGroup.Options(backend=backend)
+        pg = ProcessGroup(
+            prefix_store,
+            group_rank,
+            group_size,
+            options,
+        )
+    from torch.distributed.distributed_c10d import ProcessGroupGloo
+    backend_class = ProcessGroupGloo(prefix_store,
+                                     group_rank,
+                                     group_size,
+                                     timeout=timeout)
+    backend_type = ProcessGroup.BackendType.GLOO
+    device = torch.device("cpu")
+    if is_torch_equal_or_newer("2.6"):
+        # _set_default_backend is supported in torch >= 2.6
+        pg._set_default_backend(backend_type)
+    backend_class._set_sequence_number_for_group()
+
+    pg._register_backend(device, backend_type, backend_class)
+    return pg
+
+
 def stateless_init_torch_distributed_process_group(
         host: str, port: int, rank: int, world_size: int,
         backend: str) -> ProcessGroup:
@@ -468,40 +505,19 @@ def stateless_init_torch_distributed_process_group(
     # different systems (e.g. RPC) in case the store is multi-tenant.
     prefix_store = PrefixStore(init_method, store)
 
-    pg: ProcessGroup = ProcessGroup(
-        prefix_store,
-        group_rank,
-        group_size,
-    )
-
     if backend == "gloo":
-        from torch.distributed.distributed_c10d import ProcessGroupGloo
-        backend_class = ProcessGroupGloo(prefix_store,
-                                         group_rank,
-                                         group_size,
-                                         timeout=timeout)
-        backend_type = ProcessGroup.BackendType.GLOO
-        device = torch.device("cpu")
-    elif backend == "nccl":
-        assert is_nccl_available()
-        from torch.distributed.distributed_c10d import ProcessGroupNCCL
-
-        backend_options = ProcessGroupNCCL.Options()
-        backend_options._timeout = timeout
-
-        backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
-                                         backend_options)
-        backend_type = ProcessGroup.BackendType.NCCL
-        device = torch.device("cuda")
-    else:
-        raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
-
-    pg._set_default_backend(backend_type)
-    backend_class._set_sequence_number_for_group()
-
-    pg._register_backend(device, backend_type, backend_class)
-
-    return pg
+        return init_gloo_process_group(backend=backend,
+                                       prefix_store=prefix_store,
+                                       group_rank=group_rank,
+                                       group_size=group_size,
+                                       timeout=timeout)
+    from vllm.platforms import current_platform
+    return current_platform.stateless_init_device_torch_dist_pg(
+        backend=backend,
+        prefix_store=prefix_store,
+        group_rank=group_rank,
+        group_size=group_size,
+        timeout=timeout)
 
 
 def stateless_destroy_torch_distributed_process_group(
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 442e4100fea1c2016d54de0ebfb3211a2bfcad99..4ce1b41e4f87fd280a6f91a84ef28d18cb341567 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # yapf: disable
 import argparse
@@ -14,6 +15,7 @@ from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional,
 
 import regex as re
 import torch
+from pydantic import TypeAdapter, ValidationError
 from typing_extensions import TypeIs, deprecated
 
 import vllm.envs as envs
@@ -38,7 +40,7 @@ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
-                        GiB_bytes, is_in_doc_build, is_in_ray_actor)
+                        GiB_bytes, get_ip, is_in_ray_actor)
 
 # yapf: enable
 
@@ -149,16 +151,29 @@ def is_not_builtin(type_hint: TypeHint) -> bool:
     return type_hint.__module__ != "builtins"
 
 
+def get_type_hints(type_hint: TypeHint) -> set[TypeHint]:
+    """Extract type hints from Annotated or Union type hints."""
+    type_hints: set[TypeHint] = set()
+    origin = get_origin(type_hint)
+    args = get_args(type_hint)
+
+    if origin is Annotated:
+        type_hints.update(get_type_hints(args[0]))
+    elif origin is Union:
+        for arg in args:
+            type_hints.update(get_type_hints(arg))
+    else:
+        type_hints.add(type_hint)
+
+    return type_hints
+
+
 def get_kwargs(cls: ConfigType) -> dict[str, Any]:
     cls_docs = get_attr_docs(cls)
     kwargs = {}
     for field in fields(cls):
         # Get the set of possible types for the field
-        type_hints: set[TypeHint] = set()
-        if get_origin(field.type) in {Union, Annotated}:
-            type_hints.update(get_args(field.type))
-        else:
-            type_hints.add(field.type)
+        type_hints: set[TypeHint] = get_type_hints(field.type)
 
         # If the field is a dataclass, we can use the model_validate_json
         generator = (th for th in type_hints if is_dataclass(th))
@@ -168,10 +183,7 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
         if field.default is not MISSING:
             default = field.default
         elif field.default_factory is not MISSING:
-            if is_dataclass(field.default_factory) and is_in_doc_build():
-                default = {}
-            else:
-                default = field.default_factory()
+            default = field.default_factory()
 
         # Get the help text for the field
         name = field.name
@@ -189,12 +201,16 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
         - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n
         - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n\n"""
         if dataclass_cls is not None:
-            dataclass_init = lambda x, f=dataclass_cls: f(**json.loads(x))
-            # Special case for configs with a from_cli method
-            if hasattr(dataclass_cls, "from_cli"):
-                from_cli = dataclass_cls.from_cli
-                dataclass_init = lambda x, f=from_cli: f(x)
-            kwargs[name]["type"] = dataclass_init
+
+            def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
+                try:
+                    if hasattr(cls, "from_cli"):
+                        return cls.from_cli(val)
+                    return TypeAdapter(cls).validate_json(val)
+                except ValidationError as e:
+                    raise argparse.ArgumentTypeError(repr(e)) from e
+
+            kwargs[name]["type"] = parse_dataclass
             kwargs[name]["help"] += json_tip
         elif contains_type(type_hints, bool):
             # Creates --no-<name> and --<name> flags
@@ -221,16 +237,15 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
         elif contains_type(type_hints, int):
             kwargs[name]["type"] = int
             # Special case for large integers
-            if name in {"max_model_len"}:
+            if name in {"max_model_len", "max_num_batched_tokens"}:
                 kwargs[name]["type"] = human_readable_int
         elif contains_type(type_hints, float):
             kwargs[name]["type"] = float
-        elif contains_type(type_hints,
-                           dict) and (contains_type(type_hints, str) or any(
-                               is_not_builtin(th) for th in type_hints)):
+        elif (contains_type(type_hints, dict)
+              and (contains_type(type_hints, str)
+                   or any(is_not_builtin(th) for th in type_hints))):
             kwargs[name]["type"] = union_dict_and_str
         elif contains_type(type_hints, dict):
-            # Dict arguments will always be optional
             kwargs[name]["type"] = parse_type(json.loads)
             kwargs[name]["help"] += json_tip
         elif (contains_type(type_hints, str)
@@ -290,6 +305,7 @@ class EngineArgs:
     data_parallel_size_local: Optional[int] = None
     data_parallel_address: Optional[str] = None
     data_parallel_rpc_port: Optional[int] = None
+    data_parallel_backend: str = ParallelConfig.data_parallel_backend
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
     max_parallel_loading_workers: Optional[
         int] = ParallelConfig.max_parallel_loading_workers
@@ -317,8 +333,7 @@ class EngineArgs:
     rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling")
     rope_theta: Optional[float] = ModelConfig.rope_theta
     hf_token: Optional[Union[bool, str]] = ModelConfig.hf_token
-    hf_overrides: Optional[HfOverrides] = \
-        get_field(ModelConfig, "hf_overrides")
+    hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
     tokenizer_revision: Optional[str] = ModelConfig.tokenizer_revision
     quantization: Optional[QuantizationMethods] = ModelConfig.quantization
     enforce_eager: bool = ModelConfig.enforce_eager
@@ -372,6 +387,9 @@ class EngineArgs:
         bool] = SchedulerConfig.enable_chunked_prefill
     disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
 
+    disable_hybrid_kv_cache_manager: bool = (
+        SchedulerConfig.disable_hybrid_kv_cache_manager)
+
     guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend
     guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback
     guided_decoding_disable_any_whitespace: bool = \
@@ -398,7 +416,8 @@ class EngineArgs:
         get_field(ModelConfig, "override_neuron_config")
     override_pooler_config: Optional[Union[dict, PoolerConfig]] = \
         ModelConfig.override_pooler_config
-    compilation_config: Optional[CompilationConfig] = None
+    compilation_config: CompilationConfig = \
+        get_field(VllmConfig, "compilation_config")
     worker_cls: str = ParallelConfig.worker_cls
     worker_extension_cls: str = ParallelConfig.worker_extension_cls
 
@@ -413,13 +432,17 @@ class EngineArgs:
 
     calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
 
-    additional_config: Optional[Dict[str, Any]] = None
+    additional_config: dict[str, Any] = \
+        get_field(VllmConfig, "additional_config")
     enable_reasoning: Optional[bool] = None  # DEPRECATED
     reasoning_parser: str = DecodingConfig.reasoning_backend
 
     use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
     pt_load_map_location: str = LoadConfig.pt_load_map_location
 
+    enable_multimodal_encoder_data_parallel: bool = \
+        ParallelConfig.enable_multimodal_encoder_data_parallel
+
     def __post_init__(self):
         # support `EngineArgs(compilation_config={...})`
         # without having to manually construct a
@@ -448,7 +471,7 @@ class EngineArgs:
             title="ModelConfig",
             description=ModelConfig.__doc__,
         )
-        if 'serve' not in sys.argv[1:] and '--help' not in sys.argv[1:]:
+        if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]):
             model_group.add_argument("--model", **model_kwargs["model"])
         model_group.add_argument("--task", **model_kwargs["task"])
         model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
@@ -618,6 +641,12 @@ class EngineArgs:
                                     type=int,
                                     help='Port for data parallel RPC '
                                     'communication.')
+        parallel_group.add_argument('--data-parallel-backend',
+                                    '-dpb',
+                                    type=str,
+                                    default='mp',
+                                    help='Backend for data parallel, either '
+                                    '"mp" or "ray".')
         parallel_group.add_argument(
             "--enable-expert-parallel",
             **parallel_kwargs["enable_expert_parallel"])
@@ -634,6 +663,9 @@ class EngineArgs:
                                     **parallel_kwargs["worker_cls"])
         parallel_group.add_argument("--worker-extension-cls",
                                     **parallel_kwargs["worker_extension_cls"])
+        parallel_group.add_argument(
+            "--enable-multimodal-encoder-data-parallel",
+            **parallel_kwargs["enable_multimodal_encoder_data_parallel"])
 
         # KV cache arguments
         cache_kwargs = get_kwargs(CacheConfig)
@@ -820,6 +852,9 @@ class EngineArgs:
             **scheduler_kwargs["disable_chunked_mm_input"])
         scheduler_group.add_argument("--scheduler-cls",
                                      **scheduler_kwargs["scheduler_cls"])
+        scheduler_group.add_argument(
+            "--disable-hybrid-kv-cache-manager",
+            **scheduler_kwargs["disable_hybrid_kv_cache_manager"])
 
         # vLLM arguments
         vllm_kwargs = get_kwargs(VllmConfig)
@@ -1050,9 +1085,20 @@ class EngineArgs:
 
         # DP address, used in multi-node case for torch distributed group
         # and ZMQ sockets.
-        data_parallel_address = self.data_parallel_address if (
-            self.data_parallel_address
-            is not None) else ParallelConfig.data_parallel_master_ip
+        if self.data_parallel_address is None:
+            if self.data_parallel_backend == "ray":
+                host_ip = get_ip()
+                logger.info(
+                    "Using host IP %s as ray-based data parallel address",
+                    host_ip)
+                data_parallel_address = host_ip
+            else:
+                assert self.data_parallel_backend == "mp", (
+                    "data_parallel_backend can only be ray or mp, got %s",
+                    self.data_parallel_backend)
+                data_parallel_address = ParallelConfig.data_parallel_master_ip
+        else:
+            data_parallel_address = self.data_parallel_address
 
         # This port is only used when there are remote data parallel engines,
         # otherwise the local IPC transport is used.
@@ -1060,6 +1106,8 @@ class EngineArgs:
             self.data_parallel_rpc_port
             is not None) else ParallelConfig.data_parallel_rpc_port
 
+        data_parallel_backend = self.data_parallel_backend
+
         parallel_config = ParallelConfig(
             pipeline_parallel_size=self.pipeline_parallel_size,
             tensor_parallel_size=self.tensor_parallel_size,
@@ -1067,6 +1115,7 @@ class EngineArgs:
             data_parallel_size_local=data_parallel_size_local,
             data_parallel_master_ip=data_parallel_address,
             data_parallel_rpc_port=data_parallel_rpc_port,
+            data_parallel_backend=data_parallel_backend,
             enable_expert_parallel=self.enable_expert_parallel,
             max_parallel_loading_workers=self.max_parallel_loading_workers,
             disable_custom_all_reduce=self.disable_custom_all_reduce,
@@ -1075,6 +1124,8 @@ class EngineArgs:
             distributed_executor_backend=self.distributed_executor_backend,
             worker_cls=self.worker_cls,
             worker_extension_cls=self.worker_extension_cls,
+            enable_multimodal_encoder_data_parallel=self.
+            enable_multimodal_encoder_data_parallel,
         )
 
         speculative_config = self.create_speculative_config(
@@ -1129,6 +1180,8 @@ class EngineArgs:
             max_num_partial_prefills=self.max_num_partial_prefills,
             max_long_partial_prefills=self.max_long_partial_prefills,
             long_prefill_token_threshold=self.long_prefill_token_threshold,
+            disable_hybrid_kv_cache_manager=self.
+            disable_hybrid_kv_cache_manager,
         )
 
         lora_config = LoRAConfig(
@@ -1245,7 +1298,7 @@ class EngineArgs:
         # Skip this check if we are running on a non-GPU platform,
         # or if the device capability is not available
         # (e.g. in a Ray actor without GPUs).
-        from vllm.platforms import current_platform
+        from vllm.platforms import CpuArchEnum, current_platform
         if (current_platform.is_cuda()
                 and current_platform.get_device_capability()
                 and current_platform.get_device_capability().major < 8):
@@ -1350,10 +1403,13 @@ class EngineArgs:
             "PALLAS_VLLM_V1",
             "TRITON_ATTN_VLLM_V1",
             "TRITON_MLA",
+            "CUTLASS_MLA_VLLM_V1",
             "FLASHMLA",
             "FLASHINFER",
             "FLASHINFER_VLLM_V1",
             "ROCM_AITER_MLA",
+            "TORCH_SDPA_VLLM_V1",
+            "FLEX_ATTENTION",
         ]
         if (envs.is_set("VLLM_ATTENTION_BACKEND")
                 and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
@@ -1377,7 +1433,8 @@ class EngineArgs:
 
         if (self.pipeline_parallel_size > 1
                 and self.distributed_executor_backend
-                not in ("ray", "mp", "external_launcher")):
+                not in (ParallelConfig.distributed_executor_backend, "ray",
+                        "mp", "external_launcher")):
             name = "Pipeline Parallelism without Ray distributed executor " \
                     "or multiprocessing executor or external launcher"
             _raise_or_fallback(feature_name=name, recommend_to_remove=False)
@@ -1385,7 +1442,9 @@ class EngineArgs:
 
         # Non-[CUDA, TPU] may be supported on V1, but off by default for now.
         v0_hardware = not any(
-            (current_platform.is_cuda(), current_platform.is_tpu()))
+            (current_platform.is_cuda(), current_platform.is_tpu(),
+             (current_platform.is_cpu()
+              and current_platform.get_cpu_architecture() == CpuArchEnum.X86)))
         if v0_hardware and _warn_or_fallback(  # noqa: SIM103
                 current_platform.device_name):
             return False
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 19b219b674f389fd7220f2068d7f943c01d5b736..3d7d28055dd00106d84147d7ec8c5b401680d97b 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1,16 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import copy
 import time
 import weakref
 from functools import partial
-from typing import (Any, AsyncGenerator, Callable, Coroutine, Dict, Iterable,
-                    List, Mapping, Optional, Set, Tuple, Type, Union, overload)
+from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
+                    Mapping, Optional, Set, Tuple, Type, Union)
 from weakref import ReferenceType
 
-from typing_extensions import deprecated
-
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, VllmConfig)
@@ -35,7 +34,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, deprecate_kwargs, weak_bind
+from vllm.utils import Device, weak_bind
 
 logger = init_logger(__name__)
 ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@@ -428,23 +427,6 @@ class _AsyncLLMEngine(LLMEngine):
         return await (
             self.get_tokenizer_group().get_lora_tokenizer_async(lora_request))
 
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    async def add_request_async(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> None:
-        ...
-
-    @overload
     async def add_request_async(
         self,
         request_id: str,
@@ -455,32 +437,12 @@ class _AsyncLLMEngine(LLMEngine):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
     ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    async def add_request_async(
-            self,
-            request_id: str,
-            prompt: Optional[PromptType] = None,
-            params: Optional[Union[SamplingParams, PoolingParams]] = None,
-            arrival_time: Optional[float] = None,
-            lora_request: Optional[LoRARequest] = None,
-            trace_headers: Optional[Mapping[str, str]] = None,
-            prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-            priority: int = 0,
-            *,
-            inputs: Optional[PromptType] = None,  # DEPRECATED
-    ) -> None:
-        """Async version of
-        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]."""
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
+        """
+        Async version of
+        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request].
+        """
         if lora_request is not None and not self.lora_config:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                              "not enabled!")
@@ -490,6 +452,10 @@ class _AsyncLLMEngine(LLMEngine):
         if arrival_time is None:
             arrival_time = time.time()
 
+        if data_parallel_rank is not None:
+            raise ValueError("Targeting data_parallel_rank only supported "
+                             "in v1 client.")
+
         if (isinstance(prompt, dict)
                 and prompt.get("prompt_embeds", None) is not None
                 and not prompt.get("prompt_token_ids", None)):
@@ -513,8 +479,7 @@ class _AsyncLLMEngine(LLMEngine):
             params = await build_guided_decoding_logits_processor_async(
                 sampling_params=params,
                 tokenizer=await self.get_tokenizer_async(lora_request),
-                default_guided_backend=self.decoding_config.
-                guided_decoding_backend,
+                default_guided_backend=self.decoding_config.backend,
                 reasoning_backend=self.decoding_config.reasoning_backend,
                 model_config=self.model_config)
 
@@ -886,27 +851,7 @@ class AsyncLLMEngine(EngineClient):
                 raise
             await asyncio.sleep(0)
 
-    # This method does not need to be async, but kept that way
-    # for backwards compatibility.
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def add_request(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> Coroutine[None, None, AsyncGenerator[Union[
-            RequestOutput, PoolingRequestOutput], None]]:
-        ...
-
-    @overload
-    def add_request(
+    async def add_request(
         self,
         request_id: str,
         prompt: PromptType,
@@ -916,31 +861,8 @@ class AsyncLLMEngine(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> Coroutine[None, None, AsyncGenerator[Union[
-            RequestOutput, PoolingRequestOutput], None]]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    async def add_request(
-        self,
-        request_id: str,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
+        data_parallel_rank: Optional[int] = None,
     ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
         if not self.is_running:
             if self.start_engine_loop:
                 self.start_background_loop()
@@ -966,6 +888,7 @@ class AsyncLLMEngine(EngineClient):
             trace_headers=trace_headers,
             prompt_adapter_request=prompt_adapter_request,
             priority=priority,
+            data_parallel_rank=data_parallel_rank,
         )
 
         return stream.generator()
@@ -979,6 +902,7 @@ class AsyncLLMEngine(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         """Generate outputs for a request.
 
@@ -998,7 +922,8 @@ class AsyncLLMEngine(EngineClient):
                                             for generation, if any.
             priority: The priority of the request.
                 Only applicable with priority scheduling.
-
+            data_parallel_rank: The (global) data parallel rank that must
+                handle this request. Only applicable if DP is enabled.
         Yields:
             The output `RequestOutput` objects from the LLMEngine
             for the request.
@@ -1056,6 +981,7 @@ class AsyncLLMEngine(EngineClient):
                     trace_headers=trace_headers,
                     prompt_adapter_request=prompt_adapter_request,
                     priority=priority,
+                    data_parallel_rank=data_parallel_rank,
             ):
                 yield LLMEngine.validate_output(output, RequestOutput)
         except asyncio.CancelledError:
diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py
index aa54c0693941fbc191be31ea9d26940673d60c51..28a023a71ef52658e3abccc909365edc901eba36 100644
--- a/vllm/engine/async_timeout.py
+++ b/vllm/engine/async_timeout.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Workaround for https://github.com/python/cpython/issues/86296
 #
@@ -8,7 +9,6 @@
 import asyncio
 import enum
 import sys
-import warnings
 from types import TracebackType
 from typing import Any, Optional, Type
 
@@ -66,24 +66,6 @@ else:
             else:
                 self.update(deadline)
 
-        def __enter__(self) -> "Timeout":
-            warnings.warn(
-                "with timeout() is deprecated, use async with timeout()",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-            self._do_enter()
-            return self
-
-        def __exit__(
-            self,
-            exc_type: Optional[Type[BaseException]],
-            exc_val: Optional[BaseException],
-            exc_tb: Optional[TracebackType],
-        ) -> Optional[bool]:
-            self._do_exit(exc_type)
-            return None
-
         async def __aenter__(self) -> "Timeout":
             self._do_enter()
             return self
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index ff33d566ab68401a42a367a5ec9fc08120b38bd9..8fccf9bd2aa0008d77bfd10b45f51d01fec1a653 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 import time
@@ -10,10 +11,10 @@ from functools import partial
 from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
                     Iterable, List, Literal, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
-from typing import Set, Type, Union, cast, overload
+from typing import Set, Type, Union, cast
 
 import torch
-from typing_extensions import TypeVar, deprecated
+from typing_extensions import TypeVar
 
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
@@ -57,8 +58,7 @@ from vllm.transformers_utils.tokenizer_group import (
     TokenizerGroup, init_tokenizer_from_configs)
 from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                   usage_message)
-from vllm.utils import (Counter, Device, deprecate_kwargs,
-                        resolve_obj_by_qualname, weak_bind)
+from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind
 from vllm.version import __version__ as VLLM_VERSION
 from vllm.worker.model_runner_base import InputProcessingError
 
@@ -628,7 +628,6 @@ class LLMEngine:
     def stop_remote_worker_execution_loop(self) -> None:
         self.model_executor.stop_remote_worker_execution_loop()
 
-    @overload
     def add_request(
         self,
         request_id: str,
@@ -640,42 +639,6 @@ class LLMEngine:
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> None:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def add_request(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def add_request(
-            self,
-            request_id: str,
-            prompt: Optional[PromptType] = None,
-            params: Optional[Union[SamplingParams, PoolingParams]] = None,
-            arrival_time: Optional[float] = None,
-            lora_request: Optional[LoRARequest] = None,
-            tokenization_kwargs: Optional[dict[str, Any]] = None,
-            trace_headers: Optional[Mapping[str, str]] = None,
-            prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-            priority: int = 0,
-            *,
-            inputs: Optional[PromptType] = None,  # DEPRECATED
     ) -> None:
         """Add a request to the engine's request pool.
 
@@ -724,10 +687,6 @@ class LLMEngine:
             >>> # continue the request processing
             >>> ...
         """
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
         if lora_request is not None and not self.lora_config:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                              "not enabled!")
@@ -1680,9 +1639,6 @@ class LLMEngine:
         time_inference_requests: List[float] = []
         time_prefill_requests: List[float] = []
         time_decode_requests: List[float] = []
-        time_in_queue_requests: List[float] = []
-        model_forward_time_requests: List[float] = []
-        model_execute_time_requests: List[float] = []
         #   Metadata
         num_prompt_tokens_requests: List[int] = []
         num_generation_tokens_requests: List[int] = []
@@ -1790,15 +1746,6 @@ class LLMEngine:
                             now - seq_group.metrics.first_token_time)
                         time_inference_requests.append(
                             now - seq_group.metrics.first_scheduled_time)
-                    if seq_group.metrics.time_in_queue is not None:
-                        time_in_queue_requests.append(
-                            seq_group.metrics.time_in_queue)
-                    if seq_group.metrics.model_forward_time is not None:
-                        model_forward_time_requests.append(
-                            seq_group.metrics.model_forward_time)
-                    if seq_group.metrics.model_execute_time is not None:
-                        model_execute_time_requests.append(
-                            seq_group.metrics.model_execute_time * 1000)
                     # Metadata
                     num_prompt_tokens_requests.append(
                         len(seq_group.prompt_token_ids))
@@ -1867,9 +1814,6 @@ class LLMEngine:
             time_inference_requests=time_inference_requests,
             time_prefill_requests=time_prefill_requests,
             time_decode_requests=time_decode_requests,
-            time_in_queue_requests=time_in_queue_requests,
-            model_forward_time_requests=model_forward_time_requests,
-            model_execute_time_requests=model_execute_time_requests,
             #   Metadata
             num_prompt_tokens_requests=num_prompt_tokens_requests,
             num_generation_tokens_requests=num_generation_tokens_requests,
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 34b48f83b6436683fc7bed44f1d62c04a86ac5aa..8d51f0472351bcef3e033cab6c74d235ae8c7cd8 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from typing import TYPE_CHECKING
@@ -80,17 +81,6 @@ class Metrics:
             multiprocess_mode="livemostrecent",
         )
 
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_scheduler_swapped = self._gauge_cls(
-                name="vllm:num_requests_swapped",
-                documentation=(
-                    "Number of requests swapped to CPU. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
         #   KV Cache Usage in %
         self.gauge_gpu_cache_usage = self._gauge_cls(
             name="vllm:gpu_cache_usage_perc",
@@ -98,35 +88,6 @@ class Metrics:
             labelnames=labelnames,
             multiprocess_mode="sum")
 
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_cpu_cache_usage = self._gauge_cls(
-                name="vllm:cpu_cache_usage_perc",
-                documentation=(
-                    "CPU KV-cache usage. 1 means 100 percent usage. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-            self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
-                name="vllm:cpu_prefix_cache_hit_rate",
-                documentation=(
-                    "CPU prefix cache block hit rate. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
-        # Deprecated in 0.8 - replaced by queries+hits counters in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
-                name="vllm:gpu_prefix_cache_hit_rate",
-                documentation=("GPU prefix cache block hit rate. "
-                               "DEPRECATED: use vllm:gpu_prefix_cache_queries "
-                               "and vllm:gpu_prefix_cache_queries in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
         # Iteration stats
         self.counter_num_preemption = self._counter_cls(
             name="vllm:num_preemptions_total",
@@ -200,36 +161,6 @@ class Metrics:
             "Histogram of time spent in DECODE phase for request.",
             labelnames=labelnames,
             buckets=request_latency_buckets)
-        # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.histogram_time_in_queue_request = self._histogram_cls(
-                name="vllm:time_in_queue_requests",
-                documentation=
-                ("Histogram of time the request spent in the queue in seconds. "
-                 "DEPRECATED: use vllm:request_queue_time_seconds instead."),
-                labelnames=labelnames,
-                buckets=request_latency_buckets)
-
-        # Deprecated in 0.8 - use prefill/decode/inference time metrics
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.histogram_model_forward_time_request = self._histogram_cls(
-                name="vllm:model_forward_time_milliseconds",
-                documentation=
-                ("Histogram of time spent in the model forward pass in ms. "
-                 "DEPRECATED: use prefill/decode/inference time metrics instead"
-                 ),
-                labelnames=labelnames,
-                buckets=build_1_2_3_5_8_buckets(3000))
-            self.histogram_model_execute_time_request = self._histogram_cls(
-                name="vllm:model_execute_time_milliseconds",
-                documentation=
-                ("Histogram of time spent in the model execute function in ms."
-                 "DEPRECATED: use prefill/decode/inference time metrics instead"
-                 ),
-                labelnames=labelnames,
-                buckets=build_1_2_3_5_8_buckets(3000))
 
         #   Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
@@ -580,20 +511,10 @@ class PrometheusStatLogger(StatLoggerBase):
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
                         stats.num_running_sys)
-        if self.metrics.show_hidden_metrics:
-            self._log_gauge(self.metrics.gauge_scheduler_swapped,
-                            stats.num_swapped_sys)
         self._log_gauge(self.metrics.gauge_scheduler_waiting,
                         stats.num_waiting_sys)
         self._log_gauge(self.metrics.gauge_gpu_cache_usage,
                         stats.gpu_cache_usage_sys)
-        if self.metrics.show_hidden_metrics:
-            self._log_gauge(self.metrics.gauge_cpu_cache_usage,
-                            stats.cpu_cache_usage_sys)
-            self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
-                            stats.cpu_prefix_cache_hit_rate)
-            self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
-                            stats.gpu_prefix_cache_hit_rate)
         # Including max-lora in metric, in future this property of lora
         # config maybe extended to be dynamic.
         lora_info = {
@@ -631,15 +552,6 @@ class PrometheusStatLogger(StatLoggerBase):
                             stats.time_prefill_requests)
         self._log_histogram(self.metrics.histogram_decode_time_request,
                             stats.time_decode_requests)
-        if self.metrics.show_hidden_metrics:
-            self._log_histogram(self.metrics.histogram_time_in_queue_request,
-                                stats.time_in_queue_requests)
-            self._log_histogram(
-                self.metrics.histogram_model_forward_time_request,
-                stats.model_forward_time_requests)
-            self._log_histogram(
-                self.metrics.histogram_model_execute_time_request,
-                stats.model_execute_time_requests)
         # Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index 9e6d5ef29bedb50515ff49c04513a2c44e73b34e..9375dc4c495babf9bfb5a05e0ca119d3eb667b28 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 These types are defined in this file to avoid importing vllm.engine.metrics
 and therefore importing prometheus_client.
@@ -53,9 +54,6 @@ class Stats:
     time_inference_requests: List[float]
     time_prefill_requests: List[float]
     time_decode_requests: List[float]
-    time_in_queue_requests: List[float]
-    model_forward_time_requests: List[float]
-    model_execute_time_requests: List[float]
     #   Metadata
     num_prompt_tokens_requests: List[int]
     num_generation_tokens_requests: List[int]
diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
index af72c8e6b7766480d3143272b0ef5cd35c210f5b..db968cd6b5d862ea92966a774a7e4d7625c50fa3 100644
--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
@@ -1,11 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import List, Mapping, Optional, Union, overload
-
-from typing_extensions import deprecated
+from typing import List, Mapping, Optional, Union
 
 from vllm import PoolingParams
 from vllm.inputs import PromptType
@@ -13,7 +12,7 @@ from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
-from vllm.utils import Device, deprecate_kwargs
+from vllm.utils import Device
 
 VLLM_RPC_SUCCESS_STR = "SUCCESS"
 
@@ -37,7 +36,6 @@ class RPCProcessRequest:
     prompt_adapter_request: Optional[PromptAdapterRequest] = None
     priority: int = 0
 
-    @overload
     def __init__(
         self,
         prompt: PromptType,
@@ -48,44 +46,6 @@ class RPCProcessRequest:
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
     ) -> None:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def __init__(
-        self,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def __init__(
-            self,
-            prompt: Optional[PromptType] = None,
-            params: Optional[Union[SamplingParams, PoolingParams]] = None,
-            request_id: Optional[str] = None,
-            lora_request: Optional[LoRARequest] = None,
-            trace_headers: Optional[Mapping[str, str]] = None,
-            prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-            priority: int = 0,
-            *,
-            inputs: Optional[PromptType] = None,  # DEPRECATED
-    ) -> None:
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and params is not None
-                and request_id is not None)
-
         super().__init__()
 
         self.prompt = prompt
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 18b7c187bdffebf6c74f2e682a3360a8b7dfa81c..9e018ec7f344c55f6cd198ab1c5f3852132ddef8 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -1,17 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import copy
 import pickle
 from contextlib import contextmanager, suppress
 from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping,
-                    Optional, Union, cast, overload)
+                    Optional, Union, cast)
 
 import cloudpickle
 import psutil
 import zmq
 import zmq.asyncio
-from typing_extensions import deprecated
 from zmq import Frame  # type: ignore[attr-defined]
 from zmq.asyncio import Socket
 
@@ -48,7 +48,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
-from vllm.utils import Device, deprecate_kwargs
+from vllm.utils import Device
 
 logger = init_logger(__name__)
 
@@ -441,7 +441,6 @@ class MQLLMEngineClient(EngineClient):
     def dead_error(self) -> BaseException:
         return ENGINE_DEAD_ERROR(self._errored_with)
 
-    @overload
     def generate(
         self,
         prompt: PromptType,
@@ -451,39 +450,6 @@ class MQLLMEngineClient(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> AsyncGenerator[RequestOutput, None]:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def generate(
-        self,
-        *,
-        inputs: PromptType,
-        sampling_params: SamplingParams,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> AsyncGenerator[RequestOutput, None]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def generate(
-        self,
-        prompt: Optional[PromptType] = None,
-        sampling_params: Optional[SamplingParams] = None,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None  # DEPRECATED
     ) -> AsyncGenerator[RequestOutput, None]:
         """Generate outputs for a request.
 
@@ -505,16 +471,12 @@ class MQLLMEngineClient(EngineClient):
                 Any priority other than 0 will lead to an error if the
                 scheduling policy is not "priority".
         """
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and sampling_params is not None
-                and request_id is not None)
-
-        return self._process_request(prompt, sampling_params, request_id,
-                                     lora_request, trace_headers,
-                                     prompt_adapter_request, priority)
+        return cast(
+            AsyncGenerator[RequestOutput, None],
+            self._process_request(prompt, sampling_params, request_id,
+                                  lora_request, trace_headers,
+                                  prompt_adapter_request, priority))
 
-    @overload
     def encode(
         self,
         prompt: PromptType,
@@ -523,37 +485,6 @@ class MQLLMEngineClient(EngineClient):
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
-    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def encode(
-        self,
-        *,
-        inputs: PromptType,
-        pooling_params: PoolingParams,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def encode(
-        self,
-        prompt: Optional[PromptType] = None,
-        pooling_params: Optional[PoolingParams] = None,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None  # DEPRECATED
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
         """Generate outputs for a request from a pooling model.
 
@@ -574,11 +505,6 @@ class MQLLMEngineClient(EngineClient):
             The output `PoolingRequestOutput` objects from the LLMEngine
             for the request.
         """
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and pooling_params is not None
-                and request_id is not None)
-
         return cast(
             AsyncGenerator[PoolingRequestOutput, None],
             self._process_request(prompt,
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 434cb49855621f8aa2ee706294ecd2450b4ef97d..ef088bd3933af4f743a3556f30415e5d8228c274 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pickle
 import signal
diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py
index 4c8e295c138150595faa0005094a77bd6ac7c68e..19c5963d32dbb6c6cb438e410916d2316d1c13d1 100644
--- a/vllm/engine/output_processor/interfaces.py
+++ b/vllm/engine/output_processor/interfaces.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Callable, List
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index 110f84a65efc9adfa960a04596410880736aaa30..e0fa6a00ecfa4c4b0153e637e1143d2f4b77509d 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import functools
 from typing import Callable, List, cast
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index e88f119c87426dfffa2a0720d4b6cc84613a2649..dbf6a371d050a1698d46e72625908eea4b6be986 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List
 
diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py
index 6cad9ec8f327fd426e7f4b189dd2d933db02110d..3fb2f71b5e999a89a277a64dfb9e3703ec920f31 100644
--- a/vllm/engine/output_processor/stop_checker.py
+++ b/vllm/engine/output_processor/stop_checker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, List, Optional, Tuple
 
@@ -81,7 +82,7 @@ class StopChecker:
             return
 
         # Check if the sequence has reached max_model_len.
-        if seq.get_len() > self._get_max_model_len(lora_req):
+        if seq.get_len() >= self._get_max_model_len(lora_req):
             seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
             return
 
diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py
index 0d2b58c109e32f27f780f4ef538331705db0321d..1e127eb982425ac0ad1c8142c37545352e593b2f 100644
--- a/vllm/engine/output_processor/util.py
+++ b/vllm/engine/output_processor/util.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List
 from typing import Sequence as GenericSequence
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index a837a2d288a9c24ba37968bc06c9d92473e14988..727d59283643c29c4f8976e87c935b0ed972438f 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from abc import ABC, abstractmethod
@@ -65,6 +66,7 @@ class EngineClient(ABC):
         prompt: PromptType,
         request_id: str,
         params: BeamSearchParams,
+        lora_request: Optional[LoRARequest] = None,
     ) -> AsyncGenerator[RequestOutput, None]:
 
         beam_width = params.beam_width
@@ -106,27 +108,31 @@ class EngineClient(ABC):
                                cum_logprob=0,
                                logprobs=[],
                                multi_modal_data=multi_modal_data,
-                               mm_processor_kwargs=mm_processor_kwargs)
+                               mm_processor_kwargs=mm_processor_kwargs,
+                               lora_request=lora_request)
         ]
         completed = []
 
         for _ in range(max_tokens):
-            prompts_batch = [
+            prompts_batch, lora_req_batch = zip(*[(
                 TokensPrompt(prompt_token_ids=beam.tokens,
                              multi_modal_data=beam.multi_modal_data,
-                             mm_processor_kwargs=beam.mm_processor_kwargs)
-                for beam in all_beams
-            ]
+                             mm_processor_kwargs=beam.mm_processor_kwargs),
+                beam.lora_request,
+            ) for beam in all_beams])
 
             tasks = []
 
             request_id = f"beam_search-{random_uuid()}"
-            for i, individual_prompt in enumerate(prompts_batch):
+            for i, (individual_prompt,
+                    lora_req) in enumerate(zip(prompts_batch, lora_req_batch)):
                 request_id_item = f"{request_id}-{i}"
                 task = asyncio.create_task(
                     collect_from_async_generator(
-                        self.generate(individual_prompt, beam_search_params,
-                                      request_id_item)))
+                        self.generate(individual_prompt,
+                                      beam_search_params,
+                                      request_id_item,
+                                      lora_request=lora_req)))
                 tasks.append(task)
 
             output = await asyncio.gather(*tasks)
@@ -159,6 +165,7 @@ class EngineClient(ABC):
                                     tokens=current_beam.tokens + [token_id],
                                     logprobs=current_beam.logprobs +
                                     [logprobs],
+                                    lora_request=current_beam.lora_request,
                                     cum_logprob=current_beam.cum_logprob +
                                     logprob_obj.logprob,
                                     multi_modal_data=current_beam.
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index 1c027181156f1ca667b4e8f5f151bcd98df1ad98..3d1e5dc14d2f3d5c00a6b1b9c849dfc8a5e25ceb 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 NOTE: This API server is used only for demonstrating usage of AsyncEngine
 and simple performance benchmarks. It is not intended for production use.
@@ -16,6 +17,7 @@ from typing import Any, Optional
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 
+import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.launcher import serve_http
@@ -28,7 +30,6 @@ from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger("vllm.entrypoints.api_server")
 
-TIMEOUT_KEEP_ALIVE = 5  # seconds.
 app = FastAPI()
 engine = None
 
@@ -133,7 +134,7 @@ async def run_server(args: Namespace,
         host=args.host,
         port=args.port,
         log_level=args.log_level,
-        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
+        timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
         ssl_keyfile=args.ssl_keyfile,
         ssl_certfile=args.ssl_certfile,
         ssl_ca_certs=args.ssl_ca_certs,
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index ec1b327da9058cafb7bcaefcf780b120cdae567a..95c806c228b82ec99280918b261053ace8c7c518 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import json
@@ -1252,7 +1253,7 @@ def apply_hf_chat_template(
         # investigation.
         logger.exception(
             "An error occurred in `transformers` while applying chat template")
-        raise ValueError from e
+        raise ValueError(str(e)) from e
 
 def apply_mistral_chat_template(
     tokenizer: MistralTokenizer,
@@ -1281,7 +1282,7 @@ def apply_mistral_chat_template(
     # We convert those assertion errors to ValueErrors so they can be
     # are properly caught in the preprocessing_input step
     except (AssertionError, MistralCommonException) as e:
-        raise ValueError from e
+        raise ValueError(str(e)) from e
 
     # External library exceptions can sometimes occur despite the framework's
     # internal exception management capabilities.
@@ -1292,7 +1293,7 @@ def apply_mistral_chat_template(
         logger.exception(
             "An error occurred in `mistral_common` while applying chat "
             "template")
-        raise ValueError from e
+        raise ValueError(str(e)) from e
 
 def random_tool_call_id() -> str:
     return f"chatcmpl-tool-{random_uuid()}"
diff --git a/vllm/entrypoints/cli/benchmark/base.py b/vllm/entrypoints/cli/benchmark/base.py
index 94fb415f581f4e0d7da031cd5253642dcca82ac2..30a8844108002ce5759f8616e4157b52206bf962 100644
--- a/vllm/entrypoints/cli/benchmark/base.py
+++ b/vllm/entrypoints/cli/benchmark/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 
 from vllm.entrypoints.cli.types import CLISubcommand
diff --git a/vllm/entrypoints/cli/benchmark/latency.py b/vllm/entrypoints/cli/benchmark/latency.py
index 5aca16e0b640c6e61615c53470ae22cd5fec88d4..e0358a262dcdcdbd3ec587e1706c7faed3534aab 100644
--- a/vllm/entrypoints/cli/benchmark/latency.py
+++ b/vllm/entrypoints/cli/benchmark/latency.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 
 from vllm.benchmarks.latency import add_cli_args, main
diff --git a/vllm/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py
index 9e857af7d6dbd12fa515647c3e867d8931dfb411..717da630ab4f0780af69b909dfa1a12aa682d0da 100644
--- a/vllm/entrypoints/cli/benchmark/main.py
+++ b/vllm/entrypoints/cli/benchmark/main.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 
 import vllm.entrypoints.cli.benchmark.latency
diff --git a/vllm/entrypoints/cli/benchmark/serve.py b/vllm/entrypoints/cli/benchmark/serve.py
index d5a858920ebdb6514468bd1d57f2bb18347b1a5f..3043701570230b6df18dc35440d409e5789b0809 100644
--- a/vllm/entrypoints/cli/benchmark/serve.py
+++ b/vllm/entrypoints/cli/benchmark/serve.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 
 from vllm.benchmarks.serve import add_cli_args, main
diff --git a/vllm/entrypoints/cli/benchmark/throughput.py b/vllm/entrypoints/cli/benchmark/throughput.py
index 88ee6aa0385783bc785bdefa75bc2787bf6d6a97..20431cd3d87020172b91533e2c88936e58508420 100644
--- a/vllm/entrypoints/cli/benchmark/throughput.py
+++ b/vllm/entrypoints/cli/benchmark/throughput.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 
 from vllm.benchmarks.throughput import add_cli_args, main
diff --git a/vllm/entrypoints/cli/collect_env.py b/vllm/entrypoints/cli/collect_env.py
index 810ecfdf71c3281258d8895d5f3c9931efc16094..141aafdb1a61876391a9675f499c8b86a9d4b065 100644
--- a/vllm/entrypoints/cli/collect_env.py
+++ b/vllm/entrypoints/cli/collect_env.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 
diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py
index 6676c294c81c66c497a07183f5490863e3303512..9bb1162e38d821c2f689c5c16449e2406a42d190 100644
--- a/vllm/entrypoints/cli/main.py
+++ b/vllm/entrypoints/cli/main.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # The CLI entrypoint to vLLM.
 import signal
@@ -7,9 +8,10 @@ import sys
 import vllm.entrypoints.cli.benchmark.main
 import vllm.entrypoints.cli.collect_env
 import vllm.entrypoints.cli.openai
+import vllm.entrypoints.cli.run_batch
 import vllm.entrypoints.cli.serve
 import vllm.version
-from vllm.entrypoints.utils import VLLM_SERVE_PARSER_EPILOG, cli_env_setup
+from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup
 from vllm.utils import FlexibleArgumentParser
 
 CMD_MODULES = [
@@ -17,6 +19,7 @@ CMD_MODULES = [
     vllm.entrypoints.cli.serve,
     vllm.entrypoints.cli.benchmark.main,
     vllm.entrypoints.cli.collect_env,
+    vllm.entrypoints.cli.run_batch,
 ]
 
 
@@ -34,7 +37,7 @@ def main():
 
     parser = FlexibleArgumentParser(
         description="vLLM CLI",
-        epilog=VLLM_SERVE_PARSER_EPILOG,
+        epilog=VLLM_SUBCMD_PARSER_EPILOG,
     )
     parser.add_argument('-v',
                         '--version',
diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py
index 215fcf3c3e44e33edc6cfb52c0ae9b9ff291383a..58dcdfe217fd5316948e44addf67f07e3d2d78f8 100644
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Commands that act as an interactive OpenAI API client
 
 import argparse
diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bdd3b63c26d2c4f89867e4fdc5d08f3c9e5005c
--- /dev/null
+++ b/vllm/entrypoints/cli/run_batch.py
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import argparse
+import asyncio
+
+from prometheus_client import start_http_server
+
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.entrypoints.logger import logger
+from vllm.entrypoints.openai.run_batch import main as run_batch_main
+from vllm.entrypoints.openai.run_batch import make_arg_parser
+from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG,
+                                    show_filtered_argument_or_group_from_help)
+from vllm.utils import FlexibleArgumentParser
+from vllm.version import __version__ as VLLM_VERSION
+
+
+class RunBatchSubcommand(CLISubcommand):
+    """The `run-batch` subcommand for vLLM CLI."""
+
+    def __init__(self):
+        self.name = "run-batch"
+        super().__init__()
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        logger.info("vLLM batch processing API version %s", VLLM_VERSION)
+        logger.info("args: %s", args)
+
+        # Start the Prometheus metrics server.
+        # LLMEngine uses the Prometheus client
+        # to publish metrics at the /metrics endpoint.
+        if args.enable_metrics:
+            logger.info("Prometheus metrics enabled")
+            start_http_server(port=args.port, addr=args.url)
+        else:
+            logger.info("Prometheus metrics disabled")
+
+        asyncio.run(run_batch_main(args))
+
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        run_batch_parser = subparsers.add_parser(
+            "run-batch",
+            help="Run batch prompts and write results to file.",
+            description=(
+                "Run batch prompts using vLLM's OpenAI-compatible API.\n"
+                "Supports local or HTTP input/output files."),
+            usage=
+            "vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model <model>",
+        )
+        run_batch_parser = make_arg_parser(run_batch_parser)
+        show_filtered_argument_or_group_from_help(run_batch_parser,
+                                                  "run-batch")
+        run_batch_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG
+        return run_batch_parser
+
+
+def cmd_init() -> list[CLISubcommand]:
+    return [RunBatchSubcommand()]
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index 957fec290bf264e785ccfbf66451c958362a68e5..51807a953e0212707108b0f229388b22693797c4 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -1,24 +1,37 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
+import os
 import signal
+import sys
 
 import uvloop
+import zmq
 
 import vllm.envs as envs
 from vllm import AsyncEngineArgs
 from vllm.entrypoints.cli.types import CLISubcommand
-from vllm.entrypoints.openai.api_server import run_server
+from vllm.entrypoints.openai.api_server import (run_server, run_server_worker,
+                                                setup_server)
 from vllm.entrypoints.openai.cli_args import (make_arg_parser,
                                               validate_parsed_serve_args)
-from vllm.entrypoints.utils import (VLLM_SERVE_PARSER_EPILOG,
+from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG,
                                     show_filtered_argument_or_group_from_help)
+from vllm.executor.multiproc_worker_utils import _add_prefix
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser, get_tcp_uri
+from vllm.utils import FlexibleArgumentParser, get_tcp_uri, zmq_socket_ctx
+from vllm.v1.engine.coordinator import DPCoordinator
 from vllm.v1.engine.core import EngineCoreProc
 from vllm.v1.engine.core_client import CoreEngineProcManager
 from vllm.v1.executor.abstract import Executor
+from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
+from vllm.v1.utils import (APIServerProcessManager, CoreEngine,
+                           CoreEngineActorManager, EngineZmqAddresses,
+                           get_engine_client_zmq_addr,
+                           wait_for_completion_or_failure,
+                           wait_for_engine_startup)
 
 logger = init_logger(__name__)
 
@@ -36,9 +49,12 @@ class ServeSubcommand(CLISubcommand):
         if hasattr(args, 'model_tag') and args.model_tag is not None:
             args.model = args.model_tag
 
-        if args.headless:
+        if args.headless or args.api_server_count < 1:
             run_headless(args)
+        elif args.api_server_count > 1:
+            run_multi_api_server(args)
         else:
+            # Single API server (this process).
             uvloop.run(run_server(args))
 
     def validate(self, args: argparse.Namespace) -> None:
@@ -69,6 +85,11 @@ class ServeSubcommand(CLISubcommand):
             type=int,
             default=0,
             help='Starting data parallel rank for secondary nodes.')
+        serve_parser.add_argument('--api-server-count',
+                                  '-asc',
+                                  type=int,
+                                  default=1,
+                                  help='How many API server processes to run.')
         serve_parser.add_argument(
             "--config",
             type=str,
@@ -80,8 +101,8 @@ class ServeSubcommand(CLISubcommand):
         )
 
         serve_parser = make_arg_parser(serve_parser)
-        show_filtered_argument_or_group_from_help(serve_parser)
-        serve_parser.epilog = VLLM_SERVE_PARSER_EPILOG
+        show_filtered_argument_or_group_from_help(serve_parser, "serve")
+        serve_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG
         return serve_parser
 
 
@@ -91,23 +112,26 @@ def cmd_init() -> list[CLISubcommand]:
 
 def run_headless(args: argparse.Namespace):
 
+    if args.api_server_count > 1:
+        raise ValueError("api_server_count can't be set in headless mode")
+
     # Create the EngineConfig.
     engine_args = AsyncEngineArgs.from_cli_args(args)
     usage_context = UsageContext.OPENAI_API_SERVER
     vllm_config = engine_args.create_engine_config(usage_context=usage_context)
 
     if not envs.VLLM_USE_V1:
-        raise RuntimeError("Headless mode is only supported for V1")
+        raise ValueError("Headless mode is only supported for V1")
 
     parallel_config = vllm_config.parallel_config
     local_engine_count = parallel_config.data_parallel_size_local
     host = parallel_config.data_parallel_master_ip
     port = engine_args.data_parallel_rpc_port  # add to config too
-    input_address = get_tcp_uri(host, port)
+    handshake_address = get_tcp_uri(host, port)
 
     if local_engine_count <= 0:
-        raise RuntimeError("data_parallel_size_local must be > 0 in "
-                           "headless mode")
+        raise ValueError("data_parallel_size_local must be > 0 in "
+                         "headless mode")
 
     # Catch SIGTERM and SIGINT to allow graceful shutdown.
     def signal_handler(signum, frame):
@@ -119,7 +143,7 @@ def run_headless(args: argparse.Namespace):
 
     logger.info(
         "Launching %d data parallel engine(s) in headless mode, "
-        "with head node address %s.", local_engine_count, input_address)
+        "with head node address %s.", local_engine_count, handshake_address)
 
     # Create the engines.
     engine_manager = CoreEngineProcManager(
@@ -129,7 +153,7 @@ def run_headless(args: argparse.Namespace):
         local_start_index=0,
         vllm_config=vllm_config,
         on_head_node=False,
-        input_address=input_address,
+        handshake_address=handshake_address,
         executor_class=Executor.get_class(vllm_config),
         log_stats=not engine_args.disable_log_stats,
     )
@@ -139,3 +163,166 @@ def run_headless(args: argparse.Namespace):
     finally:
         logger.info("Shutting down.")
         engine_manager.close()
+
+
+def run_multi_api_server(args: argparse.Namespace):
+
+    assert not args.headless
+    num_api_servers = args.api_server_count
+    assert num_api_servers > 0
+
+    if num_api_servers > 1:
+        setup_multiprocess_prometheus()
+
+    listen_address, sock = setup_server(args)
+
+    engine_args = AsyncEngineArgs.from_cli_args(args)
+    usage_context = UsageContext.OPENAI_API_SERVER
+    vllm_config = engine_args.create_engine_config(usage_context=usage_context)
+    model_config = vllm_config.model_config
+
+    if num_api_servers > 1:
+        if not envs.VLLM_USE_V1:
+            raise ValueError("api_server_count > 1 is only supported for V1")
+
+        if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
+            raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used "
+                             "with api_server_count > 1")
+
+        if model_config.is_multimodal_model and not (
+                model_config.disable_mm_preprocessor_cache):
+            logger.warning(
+                "Multi-model preprocessor cache will be disabled for"
+                " api_server_count > 1")
+            model_config.disable_mm_preprocessor_cache = True
+
+    parallel_config = vllm_config.parallel_config
+
+    assert parallel_config.data_parallel_rank == 0
+
+    dp_size = parallel_config.data_parallel_size
+    local_engine_count = parallel_config.data_parallel_size_local
+    host = parallel_config.data_parallel_master_ip
+    local_only = local_engine_count == dp_size
+
+    # Set up input and output addresses.
+    input_addresses = [
+        get_engine_client_zmq_addr(local_only, host)
+        for _ in range(num_api_servers)
+    ]
+    output_addresses = [
+        get_engine_client_zmq_addr(local_only, host)
+        for _ in range(num_api_servers)
+    ]
+
+    addresses = EngineZmqAddresses(
+        inputs=input_addresses,
+        outputs=output_addresses,
+    )
+
+    # Set up coordinator for dp > 1.
+    coordinator = None
+    stats_update_address = None
+    if dp_size > 1:
+        coordinator = DPCoordinator(parallel_config)
+        addresses.coordinator_input, addresses.coordinator_output = (
+            coordinator.get_engine_socket_addresses())
+        stats_update_address = coordinator.get_stats_publish_address()
+        logger.info("Started DP Coordinator process (PID: %d)",
+                    coordinator.proc.pid)
+
+    if parallel_config.data_parallel_backend == "ray":
+        logger.info("Starting ray-based data parallel backend")
+
+        engine_actor_manager = CoreEngineActorManager(
+            vllm_config=vllm_config,
+            addresses=addresses,
+            executor_class=Executor.get_class(vllm_config),
+            log_stats=not engine_args.disable_log_stats,
+        )
+        # Start API servers using the manager
+        api_server_manager = APIServerProcessManager(
+            target_server_fn=run_api_server_worker_proc,
+            listen_address=listen_address,
+            sock=sock,
+            args=args,
+            num_servers=num_api_servers,
+            input_addresses=input_addresses,
+            output_addresses=output_addresses,
+            stats_update_address=stats_update_address)
+
+        wait_for_completion_or_failure(api_server_manager=api_server_manager,
+                                       engine_manager=engine_actor_manager,
+                                       coordinator=coordinator)
+        return
+
+    handshake_address = get_engine_client_zmq_addr(
+        local_only, host, parallel_config.data_parallel_rpc_port)
+
+    with zmq_socket_ctx(handshake_address, zmq.ROUTER,
+                        bind=True) as handshake_socket:
+
+        # Start local engines.
+        if not local_engine_count:
+            local_engine_manager = None
+        else:
+            local_engine_manager = CoreEngineProcManager(
+                EngineCoreProc.run_engine_core,
+                vllm_config=vllm_config,
+                executor_class=Executor.get_class(vllm_config),
+                log_stats=not engine_args.disable_log_stats,
+                handshake_address=handshake_address,
+                on_head_node=True,
+                local_engine_count=local_engine_count,
+                start_index=0,
+                local_start_index=0)
+
+        # Start API servers using the manager
+        api_server_manager = APIServerProcessManager(
+            target_server_fn=run_api_server_worker_proc,
+            listen_address=listen_address,
+            sock=sock,
+            args=args,
+            num_servers=num_api_servers,
+            input_addresses=input_addresses,
+            output_addresses=output_addresses,
+            stats_update_address=stats_update_address)
+
+        # Wait for engine handshakes to complete.
+        core_engines = [
+            CoreEngine(index=i, local=(i < local_engine_count))
+            for i in range(dp_size)
+        ]
+        wait_for_engine_startup(
+            handshake_socket,
+            addresses,
+            core_engines,
+            parallel_config,
+            vllm_config.cache_config,
+            local_engine_manager,
+            coordinator.proc if coordinator else None,
+        )
+
+        # Wait for API servers
+        wait_for_completion_or_failure(api_server_manager=api_server_manager,
+                                       engine_manager=local_engine_manager,
+                                       coordinator=coordinator)
+
+
+def run_api_server_worker_proc(listen_address,
+                               sock,
+                               args,
+                               client_config=None,
+                               **uvicorn_kwargs) -> None:
+    """Entrypoint for individual API server worker processes."""
+
+    # Add process-specific prefix to stdout and stderr.
+    from multiprocessing import current_process
+    process_name = current_process().name
+    pid = os.getpid()
+    _add_prefix(sys.stdout, process_name, pid)
+    _add_prefix(sys.stderr, process_name, pid)
+
+    uvloop.run(
+        run_server_worker(listen_address, sock, args, client_config,
+                          **uvicorn_kwargs))
diff --git a/vllm/entrypoints/cli/types.py b/vllm/entrypoints/cli/types.py
index f739a68c5f4c9ecee3ea329f9d3024ee45b57caf..0a724431297586dbe2161e1ac59706affd617ff4 100644
--- a/vllm/entrypoints/cli/types.py
+++ b/vllm/entrypoints/cli/types.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 
diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py
index a4f70a51ebaf34beaad928ffa140d0dda250abcf..9f4dc19fb4ab7cc4ffa5c8c1458af7fc0647b50f 100644
--- a/vllm/entrypoints/launcher.py
+++ b/vllm/entrypoints/launcher.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import signal
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 59cc44eb0e185e6503aa1a601025ff6916c984f4..6e3cb18fc559586bc1ac246aa5f876fb10e02c0e 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 import warnings
@@ -45,8 +46,7 @@ from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
                                                get_cached_tokenizer)
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import (Counter, Device, deprecate_args, deprecate_kwargs,
-                        is_list_of)
+from vllm.utils import Counter, Device, deprecate_kwargs, is_list_of
 
 if TYPE_CHECKING:
     from vllm.v1.metrics.reader import Metric
@@ -143,12 +143,6 @@ class LLM:
     DEPRECATE_LEGACY: ClassVar[bool] = True
     """A flag to toggle whether to deprecate the legacy generate/encode API."""
 
-    DEPRECATE_INIT_POSARGS: ClassVar[bool] = True
-    """
-    A flag to toggle whether to deprecate positional arguments in
-    [LLM.__init__][].
-    """
-
     @classmethod
     @contextmanager
     def deprecate_legacy_api(cls):
@@ -158,16 +152,11 @@ class LLM:
 
         cls.DEPRECATE_LEGACY = False
 
-    @deprecate_args(
-        start_index=2,  # Ignore self and model
-        is_deprecated=lambda: LLM.DEPRECATE_INIT_POSARGS,
-        additional_message=(
-            "All positional arguments other than `model` will be "
-            "replaced with keyword arguments in an upcoming version."),
-    )
     def __init__(
         self,
         model: str,
+        *,
+        task: TaskOption = "auto",
         tokenizer: Optional[str] = None,
         tokenizer_mode: TokenizerMode = "auto",
         skip_tokenizer_init: bool = False,
@@ -189,8 +178,6 @@ class LLM:
         hf_token: Optional[Union[bool, str]] = None,
         hf_overrides: Optional[HfOverrides] = None,
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
-        # After positional args are removed, move this right below `model`
-        task: TaskOption = "auto",
         override_pooler_config: Optional[PoolerConfig] = None,
         compilation_config: Optional[Union[int, dict[str, Any]]] = None,
         **kwargs,
@@ -207,6 +194,9 @@ class LLM:
             if isinstance(worker_cls, type):
                 kwargs["worker_cls"] = cloudpickle.dumps(worker_cls)
 
+        if hf_overrides is None:
+            hf_overrides = {}
+
         if compilation_config is not None:
             if isinstance(compilation_config, int):
                 compilation_config_instance = CompilationConfig(
@@ -218,7 +208,7 @@ class LLM:
             else:
                 compilation_config_instance = compilation_config
         else:
-            compilation_config_instance = None
+            compilation_config_instance = CompilationConfig()
 
         engine_args = EngineArgs(
             model=model,
@@ -291,7 +281,7 @@ class LLM:
         sampling_params: Optional[Union[SamplingParams,
                                         Sequence[SamplingParams]]] = None,
         *,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -307,7 +297,7 @@ class LLM:
         sampling_params: Optional[Union[SamplingParams,
                                         list[SamplingParams]]] = None,
         prompt_token_ids: Optional[list[int]] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -323,7 +313,7 @@ class LLM:
         sampling_params: Optional[Union[SamplingParams,
                                         list[SamplingParams]]] = None,
         prompt_token_ids: Optional[list[list[int]]] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -340,7 +330,7 @@ class LLM:
                                         list[SamplingParams]]] = None,
         *,
         prompt_token_ids: list[int],
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -357,7 +347,7 @@ class LLM:
                                         list[SamplingParams]]] = None,
         *,
         prompt_token_ids: list[list[int]],
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -372,7 +362,7 @@ class LLM:
         prompts: None,
         sampling_params: None,
         prompt_token_ids: Union[list[int], list[list[int]]],
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -392,7 +382,7 @@ class LLM:
         sampling_params: Optional[Union[SamplingParams,
                                         Sequence[SamplingParams]]] = None,
         prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         guided_options_request: Optional[Union[LLMGuidedOptions,
@@ -414,7 +404,10 @@ class LLM:
                 When it is a single value, it is applied to every prompt.
                 When it is a list, the list must have the same length as the
                 prompts and it is paired one by one with the prompt.
-            use_tqdm: Whether to use tqdm to display the progress bar.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
             lora_request: LoRA request to use for generation, if any.
             prompt_adapter_request: Prompt Adapter request to use for
                 generation, if any.
@@ -519,10 +512,27 @@ class LLM:
         executor = self.llm_engine.model_executor
         return executor.apply_model(func)
 
+    def _get_beam_search_lora_requests(
+        self,
+        lora_request: Optional[Union[list[LoRARequest], LoRARequest]],
+        prompts: list[Union[TokensPrompt, TextPrompt]],
+    ) -> list[Optional[LoRARequest]]:
+        """Get the optional lora request corresponding to each prompt."""
+        if isinstance(lora_request,
+                      Sequence) and len(lora_request) != len(prompts):
+            raise ValueError(
+                "Lora request list should be the same length as the prompts")
+
+        if lora_request is None or isinstance(lora_request, LoRARequest):
+            return [lora_request] * len(prompts)
+
+        raise TypeError(f"Invalid lora_request type {type(lora_request)}")
+
     def beam_search(
         self,
         prompts: list[Union[TokensPrompt, TextPrompt]],
         params: BeamSearchParams,
+        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
     ) -> list[BeamSearchOutput]:
         """
         Generate sequences using beam search.
@@ -531,6 +541,7 @@ class LLM:
             prompts: A list of prompts. Each prompt can be a string or a list
                 of token IDs.
             params: The beam search parameters.
+            lora_request: LoRA request to use for generation, if any.
         """
         # TODO: how does beam search work together with length penalty,
         # frequency, penalty, and stopping criteria, etc.?
@@ -540,6 +551,9 @@ class LLM:
         ignore_eos = params.ignore_eos
         length_penalty = params.length_penalty
 
+        lora_requests = self._get_beam_search_lora_requests(
+            lora_request, prompts)
+
         def sort_beams_key(x: BeamSearchSequence) -> float:
             return get_beam_search_score(x.tokens, x.cum_logprob,
                                          tokenizer.eos_token_id,
@@ -567,7 +581,7 @@ class LLM:
                                             temperature=temperature)
         instances: list[BeamSearchInstance] = []
 
-        for prompt in prompts:
+        for lora_req, prompt in zip(lora_requests, prompts):
             # Add multimodal processor kwargs & data
             mm_kwargs = {}
             if "multi_modal_data" in prompt:
@@ -583,7 +597,12 @@ class LLM:
                 prompt_tokens = tokenizer.encode(prompt["prompt"])
 
             instances.append(
-                BeamSearchInstance(prompt_tokens, logprobs=None, **mm_kwargs))
+                BeamSearchInstance(
+                    prompt_tokens,
+                    lora_request=lora_req,
+                    logprobs=None,
+                    **mm_kwargs,
+                ), )
 
         for _ in range(max_tokens):
             all_beams: list[BeamSearchSequence] = list(
@@ -597,15 +616,17 @@ class LLM:
             if len(all_beams) == 0:
                 break
 
-            prompts_batch = [
-                create_tokens_prompt_from_beam(beam) for beam in all_beams
-            ]
+            # create the corresponding batch entries for prompt & optional lora
+            prompts_batch, lora_req_batch = zip(
+                *[(create_tokens_prompt_from_beam(beam), beam.lora_request)
+                  for beam in all_beams])
 
             # only runs for one step
             # we don't need to use tqdm here
             output = self.generate(prompts_batch,
                                    sampling_params=beam_search_params,
-                                   use_tqdm=False)
+                                   use_tqdm=False,
+                                   lora_request=lora_req_batch)
 
             for (start, end), instance in zip(instance_start_and_end,
                                               instances):
@@ -623,6 +644,7 @@ class LLM:
                             new_beam = BeamSearchSequence(
                                 tokens=current_beam.tokens + [token_id],
                                 logprobs=current_beam.logprobs + [logprobs],
+                                lora_request=current_beam.lora_request,
                                 cum_logprob=current_beam.cum_logprob +
                                 logprob_obj.logprob,
                                 multi_modal_data=current_beam.multi_modal_data,
@@ -659,7 +681,7 @@ class LLM:
                         list[list[ChatCompletionMessageParam]]],
         sampling_params: Optional[Union[SamplingParams,
                                         list[SamplingParams]]] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[LoRARequest] = None,
         chat_template: Optional[str] = None,
         chat_template_content_format: ChatTemplateContentFormatOption = "auto",
@@ -690,7 +712,10 @@ class LLM:
                 is a single value, it is applied to every prompt. When it
                 is a list, the list must have the same length as the
                 prompts and it is paired one by one with the prompt.
-            use_tqdm: Whether to use tqdm to display the progress bar.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
             lora_request: LoRA request to use for generation, if any.
             chat_template: The template to use for structuring the chat.
                 If not provided, the model's default chat template will be used.
@@ -804,7 +829,7 @@ class LLM:
                                        Sequence[PoolingParams]]] = None,
         *,
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -819,7 +844,7 @@ class LLM:
                                        Sequence[PoolingParams]]] = None,
         prompt_token_ids: Optional[list[int]] = None,
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -834,7 +859,7 @@ class LLM:
                                        Sequence[PoolingParams]]] = None,
         prompt_token_ids: Optional[list[list[int]]] = None,
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -850,7 +875,7 @@ class LLM:
         *,
         prompt_token_ids: list[int],
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -866,7 +891,7 @@ class LLM:
         *,
         prompt_token_ids: list[list[int]],
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -880,7 +905,7 @@ class LLM:
         pooling_params: None,
         prompt_token_ids: Union[list[int], list[list[int]]],
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -899,7 +924,7 @@ class LLM:
                                        Sequence[PoolingParams]]] = None,
         prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None,
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[PoolingRequestOutput]:
@@ -916,7 +941,10 @@ class LLM:
                 for more details about the format of each prompts.
             pooling_params: The pooling parameters for pooling. If None, we
                 use the default pooling parameters.
-            use_tqdm: Whether to use tqdm to display the progress bar.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
             lora_request: LoRA request to use for generation, if any.
             prompt_adapter_request: Prompt Adapter request to use for
                 generation, if any.
@@ -986,7 +1014,7 @@ class LLM:
         /,
         *,
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         pooling_params: Optional[Union[PoolingParams,
                                        Sequence[PoolingParams]]] = None,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
@@ -1005,7 +1033,10 @@ class LLM:
                 for more details about the format of each prompts.
             pooling_params: The pooling parameters for pooling. If None, we
                 use the default pooling parameters.
-            use_tqdm: Whether to use tqdm to display the progress bar.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
             lora_request: LoRA request to use for generation, if any.
             prompt_adapter_request: Prompt Adapter request to use for
                 generation, if any.
@@ -1032,7 +1063,7 @@ class LLM:
         prompts: Union[PromptType, Sequence[PromptType]],
         /,
         *,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[ClassificationRequestOutput]:
@@ -1047,7 +1078,10 @@ class LLM:
             prompts: The prompts to the LLM. You may pass a sequence of prompts
                 for batch inference. See [PromptType][vllm.inputs.PromptType]
                 for more details about the format of each prompts.
-            use_tqdm: Whether to use tqdm to display the progress bar.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
             lora_request: LoRA request to use for generation, if any.
             prompt_adapter_request: Prompt Adapter request to use for
                 generation, if any.
@@ -1073,7 +1107,7 @@ class LLM:
         text_1: list[Union[str, TextPrompt, TokensPrompt]],
         text_2: list[Union[str, TextPrompt, TokensPrompt]],
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[ScoringRequestOutput]:
@@ -1107,7 +1141,7 @@ class LLM:
         text_1: list[str],
         text_2: list[str],
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[ScoringRequestOutput]:
@@ -1159,7 +1193,7 @@ class LLM:
         /,
         *,
         truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: bool = True,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[ScoringRequestOutput]:
@@ -1179,7 +1213,10 @@ class LLM:
             text_2: The texts to pair with the query to form the input
                 to the LLM. See [PromptType][vllm.inputs.PromptType] for
                 more details about the format of each prompts.
-            use_tqdm: Whether to use tqdm to display the progress bar.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
             lora_request: LoRA request to use for generation, if any.
             prompt_adapter_request: Prompt Adapter request to use for
                 generation, if any.
@@ -1360,7 +1397,7 @@ class LLM:
         params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams,
                       Sequence[PoolingParams]],
         *,
-        use_tqdm: bool,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]],
         prompt_adapter_request: Optional[PromptAdapterRequest],
         tokenization_kwargs: Optional[dict[str, Any]] = None,
@@ -1398,7 +1435,8 @@ class LLM:
         # Add requests to the engine.
         it = prompts
         if use_tqdm:
-            it = tqdm(it, desc="Adding requests")
+            tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
+            it = tqdm_func(it, desc="Adding requests")
 
         for i, prompt in enumerate(it):
             self._add_request(
@@ -1455,12 +1493,15 @@ class LLM:
         return params
 
     def _run_engine(
-            self, *, use_tqdm: bool
+        self,
+        *,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True
     ) -> list[Union[RequestOutput, PoolingRequestOutput]]:
         # Initialize tqdm.
         if use_tqdm:
             num_requests = self.llm_engine.get_num_unfinished_requests()
-            pbar = tqdm(
+            tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
+            pbar = tqdm_func(
                 total=num_requests,
                 desc="Processed prompts",
                 dynamic_ncols=True,
diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py
index d4655dd5e6ab865995f1ead7732dde6b6020aca9..f3aee188dae9463e884b607026e7be9bbeca71d8 100644
--- a/vllm/entrypoints/logger.py
+++ b/vllm/entrypoints/logger.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 2da89b4f5944cd60e6f356dd75380cdc275b4cad..62f1c6a7c12bff90b1567e2490913f4de0c7a447 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1,10 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import atexit
 import gc
 import importlib
 import inspect
+import json
 import multiprocessing
 import os
 import signal
@@ -16,8 +18,7 @@ from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager
 from functools import partial
 from http import HTTPStatus
-from json import JSONDecodeError
-from typing import Annotated, Optional, Union
+from typing import Annotated, Any, Optional
 
 import prometheus_client
 import regex as re
@@ -26,6 +27,8 @@ from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
+from prometheus_client import make_asgi_app
+from prometheus_fastapi_instrumentator import Instrumentator
 from starlette.concurrency import iterate_in_threadpool
 from starlette.datastructures import State
 from starlette.routing import Mount
@@ -59,9 +62,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               EmbeddingChatRequest,
                                               EmbeddingCompletionRequest,
                                               EmbeddingRequest,
-                                              EmbeddingResponse,
-                                              EmbeddingResponseData,
-                                              ErrorResponse,
+                                              EmbeddingResponse, ErrorResponse,
                                               LoadLoRAAdapterRequest,
                                               PoolingChatRequest,
                                               PoolingCompletionRequest,
@@ -99,10 +100,9 @@ from vllm.transformers_utils.tokenizer import MistralTokenizer
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path,
                         is_valid_ipv6_address, set_ulimit)
+from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
 
-TIMEOUT_KEEP_ALIVE = 5  # seconds
-
 prometheus_multiproc_dir: tempfile.TemporaryDirectory
 
 # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
@@ -144,14 +144,17 @@ async def lifespan(app: FastAPI):
 
 @asynccontextmanager
 async def build_async_engine_client(
-        args: Namespace) -> AsyncIterator[EngineClient]:
+    args: Namespace,
+    client_config: Optional[dict[str, Any]] = None,
+) -> AsyncIterator[EngineClient]:
 
     # Context manager to handle engine_client lifecycle
     # Ensures everything is shutdown and cleaned up on error/exit
     engine_args = AsyncEngineArgs.from_cli_args(args)
 
     async with build_async_engine_client_from_engine_args(
-            engine_args, args.disable_frontend_multiprocessing) as engine:
+            engine_args, args.disable_frontend_multiprocessing,
+            client_config) as engine:
         yield engine
 
 
@@ -159,6 +162,7 @@ async def build_async_engine_client(
 async def build_async_engine_client_from_engine_args(
     engine_args: AsyncEngineArgs,
     disable_frontend_multiprocessing: bool = False,
+    client_config: Optional[dict[str, Any]] = None,
 ) -> AsyncIterator[EngineClient]:
     """
     Create EngineClient, either:
@@ -181,12 +185,16 @@ async def build_async_engine_client_from_engine_args(
 
         from vllm.v1.engine.async_llm import AsyncLLM
         async_llm: Optional[AsyncLLM] = None
+        client_index = client_config.pop(
+            "client_index") if client_config else 0
         try:
             async_llm = AsyncLLM.from_vllm_config(
                 vllm_config=vllm_config,
                 usage_context=usage_context,
                 disable_log_requests=engine_args.disable_log_requests,
-                disable_log_stats=engine_args.disable_log_stats)
+                disable_log_stats=engine_args.disable_log_stats,
+                client_addresses=client_config,
+                client_index=client_index)
 
             # Don't keep the dummy data in memory
             await async_llm.reset_mm_cache()
@@ -320,22 +328,9 @@ class PrometheusResponse(Response):
 
 
 def mount_metrics(app: FastAPI):
-    # Lazy import for prometheus multiprocessing.
-    # We need to set PROMETHEUS_MULTIPROC_DIR environment variable
-    # before prometheus_client is imported.
-    # See https://prometheus.github.io/client_python/multiprocess/
-    from prometheus_client import (REGISTRY, CollectorRegistry, make_asgi_app,
-                                   multiprocess)
-    from prometheus_fastapi_instrumentator import Instrumentator
-
-    registry = REGISTRY
-
-    prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None)
-    if prometheus_multiproc_dir_path is not None:
-        logger.debug("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR",
-                     prometheus_multiproc_dir_path)
-        registry = CollectorRegistry()
-        multiprocess.MultiProcessCollector(registry)
+    """Mount prometheus metrics to a FastAPI app."""
+
+    registry = get_prometheus_registry()
 
     # `response_class=PrometheusResponse` is needed to return an HTTP response
     # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8"
@@ -627,37 +622,10 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
 async def create_embedding(request: EmbeddingRequest, raw_request: Request):
     handler = embedding(raw_request)
     if handler is None:
-        fallback_handler = pooling(raw_request)
-        if fallback_handler is None:
-            return base(raw_request).create_error_response(
-                message="The model does not support Embeddings API")
+        return base(raw_request).create_error_response(
+            message="The model does not support Embeddings API")
 
-        logger.warning(
-            "Embeddings API will become exclusive to embedding models "
-            "in a future release. To return the hidden states directly, "
-            "use the Pooling API (`/pooling`) instead.")
-
-        res = await fallback_handler.create_pooling(request, raw_request)
-
-        generator: Union[ErrorResponse, EmbeddingResponse]
-        if isinstance(res, PoolingResponse):
-            generator = EmbeddingResponse(
-                id=res.id,
-                object=res.object,
-                created=res.created,
-                model=res.model,
-                data=[
-                    EmbeddingResponseData(
-                        index=d.index,
-                        embedding=d.data,  # type: ignore
-                    ) for d in res.data
-                ],
-                usage=res.usage,
-            )
-        else:
-            generator = res
-    else:
-        generator = await handler.create_embedding(request, raw_request)
+    generator = await handler.create_embedding(request, raw_request)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
@@ -961,7 +929,7 @@ async def invocations(raw_request: Request):
     """
     try:
         body = await raw_request.json()
-    except JSONDecodeError as e:
+    except json.JSONDecodeError as e:
         raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value,
                             detail=f"JSON decode error: {e}") from e
 
@@ -1034,6 +1002,18 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
         return Response(status_code=200, content=response)
 
 
+def load_log_config(log_config_file: Optional[str]) -> Optional[dict]:
+    if not log_config_file:
+        return None
+    try:
+        with open(log_config_file) as f:
+            return json.load(f)
+    except Exception as e:
+        logger.warning("Failed to load log config from file %s: error %s",
+                       log_config_file, e)
+        return None
+
+
 def build_app(args: Namespace) -> FastAPI:
     if args.disable_fastapi_docs:
         app = FastAPI(openapi_url=None,
@@ -1285,16 +1265,10 @@ def create_server_socket(addr: tuple[str, int]) -> socket.socket:
     return sock
 
 
-async def run_server(args, **uvicorn_kwargs) -> None:
-    logger.info("vLLM API server version %s", VLLM_VERSION)
-    log_non_default_args(args)
-
-    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
-        ToolParserManager.import_tool_parser(args.tool_parser_plugin)
-
+def validate_api_server_args(args):
     valid_tool_parses = ToolParserManager.tool_parsers.keys()
     if args.enable_auto_tool_choice \
-        and args.tool_call_parser not in valid_tool_parses:
+            and args.tool_call_parser not in valid_tool_parses:
         raise KeyError(f"invalid tool call parser: {args.tool_call_parser} "
                        f"(chose from {{ {','.join(valid_tool_parses)} }})")
 
@@ -1305,6 +1279,19 @@ async def run_server(args, **uvicorn_kwargs) -> None:
             f"invalid reasoning parser: {args.reasoning_parser} "
             f"(chose from {{ {','.join(valid_reasoning_parses)} }})")
 
+
+def setup_server(args):
+    """Validate API server args, set up signal handler, create socket
+    ready to serve."""
+
+    logger.info("vLLM API server version %s", VLLM_VERSION)
+    log_non_default_args(args)
+
+    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
+        ToolParserManager.import_tool_parser(args.tool_parser_plugin)
+
+    validate_api_server_args(args)
+
     # workaround to make sure that we bind the port before the engine is set up.
     # This avoids race conditions with ray.
     # see https://github.com/vllm-project/vllm/issues/8204
@@ -1321,22 +1308,46 @@ async def run_server(args, **uvicorn_kwargs) -> None:
 
     signal.signal(signal.SIGTERM, signal_handler)
 
-    async with build_async_engine_client(args) as engine_client:
+    addr, port = sock_addr
+    is_ssl = args.ssl_keyfile and args.ssl_certfile
+    host_part = f"[{addr}]" if is_valid_ipv6_address(
+        addr) else addr or "0.0.0.0"
+    listen_address = f"http{'s' if is_ssl else ''}://{host_part}:{port}"
+
+    return listen_address, sock
+
+
+async def run_server(args, **uvicorn_kwargs) -> None:
+    """Run a single-worker API server."""
+    listen_address, sock = setup_server(args)
+    await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+
+
+async def run_server_worker(listen_address,
+                            sock,
+                            args,
+                            client_config=None,
+                            **uvicorn_kwargs) -> None:
+    """Run a single API server worker."""
+
+    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
+        ToolParserManager.import_tool_parser(args.tool_parser_plugin)
+
+    server_index = client_config.get("client_index", 0) if client_config else 0
+
+    # Load logging config for uvicorn if specified
+    log_config = load_log_config(args.log_config_file)
+    if log_config is not None:
+        uvicorn_kwargs['log_config'] = log_config
+
+    async with build_async_engine_client(args, client_config) as engine_client:
         app = build_app(args)
 
         vllm_config = await engine_client.get_vllm_config()
         await init_app_state(engine_client, vllm_config, app.state, args)
 
-        def _listen_addr(a: str) -> str:
-            if is_valid_ipv6_address(a):
-                return '[' + a + ']'
-            return a or "0.0.0.0"
-
-        is_ssl = args.ssl_keyfile and args.ssl_certfile
-        logger.info("Starting vLLM API server on http%s://%s:%d",
-                    "s" if is_ssl else "", _listen_addr(sock_addr[0]),
-                    sock_addr[1])
-
+        logger.info("Starting vLLM API server %d on %s", server_index,
+                    listen_address)
         shutdown_task = await serve_http(
             app,
             sock=sock,
@@ -1347,7 +1358,7 @@ async def run_server(args, **uvicorn_kwargs) -> None:
             # NOTE: When the 'disable_uvicorn_access_log' value is True,
             # no access log will be output.
             access_log=not args.disable_uvicorn_access_log,
-            timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
+            timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
             ssl_keyfile=args.ssl_keyfile,
             ssl_certfile=args.ssl_certfile,
             ssl_ca_certs=args.ssl_ca_certs,
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index d01af5e4226660dfd5c2579ecc109d1a16a415b2..ca70e78df32609b0db0e48aea504a9c4a5701510 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file contains the command line arguments for the vLLM's
 OpenAI-compatible server. It is kept in a separate file for documentation
@@ -11,6 +12,7 @@ import ssl
 from collections.abc import Sequence
 from typing import Optional, Union, get_args
 
+import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
 from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
                                          validate_chat_template)
@@ -243,6 +245,13 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         " into OpenAI API format, the name register in this plugin can be used "
         "in ``--tool-call-parser``.")
 
+    parser.add_argument(
+        "--log-config-file",
+        type=str,
+        default=envs.VLLM_LOGGING_CONFIG_PATH,
+        help="Path to logging config JSON file for both vllm and uvicorn",
+    )
+
     parser = AsyncEngineArgs.add_cli_args(parser)
 
     parser.add_argument('--max-log-len',
diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py
index 04d5091a968117d56a62f54e97b5e5a1ec0555a1..29d72256cf70bba33b1a49530c97fb34760bf83f 100644
--- a/vllm/entrypoints/openai/logits_processors.py
+++ b/vllm/entrypoints/openai/logits_processors.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from functools import lru_cache, partial
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 393cf381b16b73c3329463212baa20d6be4ea29b..79f0f200c74edb453154f5ffe8d406b3af42d757 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
@@ -175,11 +176,15 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
     type: Literal["function"] = "function"
 
 
+# extra="forbid" is a workaround to have kwargs as a field,
+# see https://github.com/pydantic/pydantic/issues/3125
 class LogitsProcessorConstructor(BaseModel):
     qualname: str
     args: Optional[list[Any]] = None
     kwargs: Optional[dict[str, Any]] = None
 
+    model_config = ConfigDict(extra="forbid")
+
 
 LogitsProcessors = list[Union[str, LogitsProcessorConstructor]]
 
@@ -234,7 +239,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     presence_penalty: Optional[float] = 0.0
     response_format: Optional[AnyResponseFormat] = None
     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
-    stop: Optional[Union[str, list[str]]] = Field(default_factory=list)
+    stop: Optional[Union[str, list[str]]] = []
     stream: Optional[bool] = False
     stream_options: Optional[StreamOptions] = None
     temperature: Optional[float] = None
@@ -258,7 +263,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     min_p: Optional[float] = None
     repetition_penalty: Optional[float] = None
     length_penalty: float = 1.0
-    stop_token_ids: Optional[list[int]] = Field(default_factory=list)
+    stop_token_ids: Optional[list[int]] = []
     include_stop_str_in_output: bool = False
     ignore_eos: bool = False
     min_tokens: int = 0
@@ -266,6 +271,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     spaces_between_special_tokens: bool = True
     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
     prompt_logprobs: Optional[int] = None
+    allowed_token_ids: Optional[list[int]] = None
     # --8<-- [end:chat-completion-sampling-params]
 
     # --8<-- [start:chat-completion-extra-params]
@@ -544,6 +550,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
                 else RequestOutputKind.FINAL_ONLY,
             guided_decoding=guided_decoding,
             logit_bias=self.logit_bias,
+            allowed_token_ids=self.allowed_token_ids,
             extra_args=({"kv_transfer_params": self.kv_transfer_params}
                         if self.kv_transfer_params else None))
 
@@ -756,7 +763,7 @@ class CompletionRequest(OpenAIBaseModel):
     n: int = 1
     presence_penalty: Optional[float] = 0.0
     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
-    stop: Optional[Union[str, list[str]]] = Field(default_factory=list)
+    stop: Optional[Union[str, list[str]]] = []
     stream: Optional[bool] = False
     stream_options: Optional[StreamOptions] = None
     suffix: Optional[str] = None
@@ -770,7 +777,7 @@ class CompletionRequest(OpenAIBaseModel):
     min_p: Optional[float] = None
     repetition_penalty: Optional[float] = None
     length_penalty: float = 1.0
-    stop_token_ids: Optional[list[int]] = Field(default_factory=list)
+    stop_token_ids: Optional[list[int]] = []
     include_stop_str_in_output: bool = False
     ignore_eos: bool = False
     min_tokens: int = 0
@@ -1477,6 +1484,10 @@ class TranscriptionStreamResponse(OpenAIBaseModel):
     usage: Optional[UsageInfo] = Field(default=None)
 
 
+BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest,
+                              ScoreRequest, RerankRequest]
+
+
 class BatchRequestInput(OpenAIBaseModel):
     """
     The per-line object of the batch input file.
@@ -1497,21 +1508,22 @@ class BatchRequestInput(OpenAIBaseModel):
     url: str
 
     # The parameters of the request.
-    body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest]
+    body: BatchRequestInputBody
 
     @field_validator('body', mode='plain')
     @classmethod
     def check_type_for_url(cls, value: Any, info: ValidationInfo):
         # Use url to disambiguate models
-        url = info.data['url']
+        url: str = info.data["url"]
         if url == "/v1/chat/completions":
             return ChatCompletionRequest.model_validate(value)
         if url == "/v1/embeddings":
             return TypeAdapter(EmbeddingRequest).validate_python(value)
-        if url == "/v1/score":
+        if url.endswith("/score"):
             return ScoreRequest.model_validate(value)
-        return TypeAdapter(Union[ChatCompletionRequest, EmbeddingRequest,
-                                 ScoreRequest]).validate_python(value)
+        if url.endswith("/rerank"):
+            return RerankRequest.model_validate(value)
+        return TypeAdapter(BatchRequestInputBody).validate_python(value)
 
 
 class BatchResponseData(OpenAIBaseModel):
@@ -1523,7 +1535,7 @@ class BatchResponseData(OpenAIBaseModel):
 
     # The body of the response.
     body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
-                         ScoreResponse]] = None
+                         ScoreResponse, RerankResponse]] = None
 
 
 class BatchRequestOutput(OpenAIBaseModel):
@@ -1554,6 +1566,11 @@ class TokenizeCompletionRequest(OpenAIBaseModel):
             "If true (the default), special tokens (e.g. BOS) will be added to "
             "the prompt."),
     )
+    return_token_strs: Optional[bool] = Field(
+        default=False,
+        description=("If true, also return the token strings "
+                     "corresponding to the token ids."),
+    )
 
 
 class TokenizeChatRequest(OpenAIBaseModel):
@@ -1567,6 +1584,11 @@ class TokenizeChatRequest(OpenAIBaseModel):
          "This is a parameter used by chat template in tokenizer config of the "
          "model."),
     )
+    return_token_strs: Optional[bool] = Field(
+        default=False,
+        description=("If true, also return the token strings "
+                     "corresponding to the token ids."),
+    )
     continue_final_message: bool = Field(
         default=False,
         description=
@@ -1624,6 +1646,7 @@ class TokenizeResponse(OpenAIBaseModel):
     count: int
     max_model_len: int
     tokens: list[int]
+    token_strs: Optional[list[str]] = None
 
 
 class DetokenizeRequest(OpenAIBaseModel):
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index eae83c9a494a03bd512081389070df018ecf8629..9994b3cae888815b8ead37e17e88d18d944400cb 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import tempfile
@@ -21,7 +22,7 @@ from vllm.entrypoints.openai.protocol import (BatchRequestInput,
                                               BatchResponseData,
                                               ChatCompletionResponse,
                                               EmbeddingResponse, ErrorResponse,
-                                              ScoreResponse)
+                                              RerankResponse, ScoreResponse)
 # yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
@@ -33,9 +34,7 @@ from vllm.utils import FlexibleArgumentParser, random_uuid
 from vllm.version import __version__ as VLLM_VERSION
 
 
-def parse_args():
-    parser = FlexibleArgumentParser(
-        description="vLLM OpenAI-Compatible batch runner.")
+def make_arg_parser(parser: FlexibleArgumentParser):
     parser.add_argument(
         "-i",
         "--input-file",
@@ -98,7 +97,13 @@ def parse_args():
         default=False,
         help="If set to True, enable prompt_tokens_details in usage.")
 
-    return parser.parse_args()
+    return parser
+
+
+def parse_args():
+    parser = FlexibleArgumentParser(
+        description="vLLM OpenAI-Compatible batch runner.")
+    return make_arg_parser(parser).parse_args()
 
 
 # explicitly use pure text format, with a newline at the end
@@ -270,8 +275,11 @@ async def run_request(serving_engine_func: Callable,
                       tracker: BatchProgressTracker) -> BatchRequestOutput:
     response = await serving_engine_func(request.body)
 
-    if isinstance(response,
-                  (ChatCompletionResponse, EmbeddingResponse, ScoreResponse)):
+    if isinstance(
+            response,
+        (ChatCompletionResponse, EmbeddingResponse, ScoreResponse,
+         RerankResponse),
+    ):
         batch_output = BatchRequestOutput(
             id=f"vllm-{random_uuid()}",
             custom_id=request.custom_id,
@@ -393,7 +401,7 @@ async def main(args):
             response_futures.append(
                 run_request(embed_handler_fn, request, tracker))
             tracker.submitted()
-        elif request.url == "/v1/score":
+        elif request.url.endswith("/score"):
             score_handler_fn = openai_serving_scores.create_score if \
                 openai_serving_scores is not None else None
             if score_handler_fn is None:
@@ -407,13 +415,29 @@ async def main(args):
             response_futures.append(
                 run_request(score_handler_fn, request, tracker))
             tracker.submitted()
+        elif request.url.endswith("/rerank"):
+            rerank_handler_fn = openai_serving_scores.do_rerank if \
+                openai_serving_scores is not None else None
+            if rerank_handler_fn is None:
+                response_futures.append(
+                    make_async_error_request_output(
+                        request,
+                        error_msg="The model does not support Rerank API",
+                    ))
+                continue
+
+            response_futures.append(
+                run_request(rerank_handler_fn, request, tracker))
+            tracker.submitted()
         else:
             response_futures.append(
                 make_async_error_request_output(
                     request,
-                    error_msg=
-                    "Only /v1/chat/completions, /v1/embeddings, and /v1/score "
-                    "are supported in the batch endpoint.",
+                    error_msg=f"URL {request.url} was used. "
+                    "Supported endpoints: /v1/chat/completions, /v1/embeddings,"
+                    " /score, /rerank ."
+                    "See vllm/entrypoints/openai/api_server.py for supported "
+                    "score/rerank versions.",
                 ))
 
     with tracker.pbar():
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index bc11686d7be899f41eca70fa47e2970ed9b536a9..79eac184a212907f9b555c902157813ef8ff4bc6 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import json
@@ -236,6 +237,7 @@ class OpenAIServingChat(OpenAIServing):
                         prompt=engine_prompt,
                         request_id=request_id,
                         params=sampling_params,
+                        lora_request=lora_request,
                     )
                 else:
                     generator = self.engine_client.generate(
@@ -318,10 +320,13 @@ class OpenAIServingChat(OpenAIServing):
     def extract_tool_call_required_streaming(
         self,
         previous_text: str,
-        current_text: str,
+        current_text: Optional[str],
         delta_text: str,
         function_name_returned: bool,
     ) -> tuple[Optional[DeltaMessage], bool]:
+        if current_text is None or current_text == "":
+            # if the current text is empty, we cannot parse it
+            return None, function_name_returned
         try:
             obj = partial_json_parser.loads(current_text)
         except partial_json_parser.core.exceptions.MalformedJSON:
@@ -648,10 +653,18 @@ class OpenAIServingChat(OpenAIServing):
                         current_text = previous_text + delta_text
                         fn_name_returned = function_name_returned[i]
 
+                        if self.reasoning_parser:
+                            _, content = \
+                                reasoning_parser.extract_reasoning_content(
+                                    current_text,
+                                    request
+                                )
+                        else:
+                            content = current_text
                         delta_message, function_name_returned[i] = (
                             self.extract_tool_call_required_streaming(
                                 previous_text=previous_text,
-                                current_text=current_text,
+                                current_text=content,
                                 delta_text=delta_text,
                                 function_name_returned=fn_name_returned))
 
@@ -676,7 +689,21 @@ class OpenAIServingChat(OpenAIServing):
                                     current_token_ids,
                                     output.token_ids,
                                 ))
-
+                            # When encountering think end id in prompt_token_ids
+                            # i.e {"enable_thinking": False},
+                            # set reasoning status to end.
+                            # Remove the text and token ids related
+                            # to 'reasoning_content'.
+                            if res.prompt_token_ids and \
+                                reasoning_parser.is_reasoning_end(
+                                    list(res.prompt_token_ids)):
+                                reasoning_end_arr[i] = True
+                                current_token_ids = list(output.token_ids)
+                                if delta_message and delta_message.content:
+                                    current_text = delta_message.content
+                                    delta_message.content = None
+                                else:
+                                    current_text = ""
                             # When encountering think end id in delta_token_ids,
                             # set reasoning status to end.
                             # Remove the text and token ids related
@@ -979,15 +1006,17 @@ class OpenAIServingChat(OpenAIServing):
 
                 # the fields of FunctionDefinition are a superset of the
                 # tool call outputs and can be used for parsing
+                assert content is not None
                 tool_calls = TypeAdapter(
-                    list[FunctionDefinition]).validate_json(output.text)
+                    list[FunctionDefinition]).validate_json(content)
                 message = ChatMessage(
                     role=role,
                     content="",
                     tool_calls=[
                         tool_call_class(function=FunctionCall(
                             name=tool_call.name,
-                            arguments=json.dumps(tool_call.parameters)))
+                            arguments=json.dumps(tool_call.parameters,
+                                                 ensure_ascii=False)))
                         for tool_call in tool_calls
                     ])
 
diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py
index 90cdd389d59f08eb79e48dd27e49b93a8b8ab205..3ac4f01ea6028f08e41a179dd4be54b6e5b5489c 100644
--- a/vllm/entrypoints/openai/serving_classification.py
+++ b/vllm/entrypoints/openai/serving_classification.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from http import HTTPStatus
 from typing import Optional, Union, cast
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 7beaae287de99b4375c1bb78f20e3d6912360e7a..ce5eca8550289121b675f213a1c66e81cc2a7143 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import time
@@ -186,6 +187,7 @@ class OpenAIServingCompletion(OpenAIServing):
                         prompt=engine_prompt,
                         request_id=request_id,
                         params=sampling_params,
+                        lora_request=lora_request,
                     )
                 else:
                     generator = self.engine_client.generate(
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 3785d2642f9d965cfc7d530fbe0434d9e6696619..e87decfe636acebee43f8f65969d4e8fc8e25980 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 from typing import Final, Literal, Optional, Union, cast
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index c73575b48d9c9e821367fa6d268ba1be755074c1..ac3883bdeb33c1d91b399b832263848ad025f38b 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import base64
 import io
 import json
@@ -134,11 +135,9 @@ class RequestProcessingMixin(BaseModel):
     Mixin for request processing, 
     handling prompt preparation and engine input.
     """
-    request_prompts: Optional[Sequence[RequestPrompt]] = \
-                            Field(default_factory=list)
+    request_prompts: Optional[Sequence[RequestPrompt]] = []
     engine_prompts: Optional[Union[list[EngineTokensPrompt],
-                                   list[EngineEmbedsPrompt]]] = Field(
-                                       default_factory=list)
+                                   list[EngineEmbedsPrompt]]] = []
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
@@ -528,12 +527,14 @@ class OpenAIServing:
         if isinstance(request,
                       (EmbeddingChatRequest, EmbeddingCompletionRequest,
                        ScoreRequest, RerankRequest, ClassificationRequest)):
-            operation = {
-                ScoreRequest: "score",
-                ClassificationRequest: "classification"
-            }.get(type(request), "embedding generation")
 
             if token_num > self.max_model_len:
+                operations: dict[type[AnyRequest], str] = {
+                    ScoreRequest: "score",
+                    ClassificationRequest: "classification"
+                }
+                operation = operations.get(type(request),
+                                           "embedding generation")
                 raise ValueError(
                     f"This model's maximum context length is "
                     f"{self.max_model_len} tokens. However, you requested "
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index 74433a1a3c3f5703188e00b09dbf7d9a1d82cfc6..764b0e73690de1c167004661fcebd43633930dbc 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 import pathlib
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
index 7c401d4f5cb142866df5e6a37bd58f33ae0a0ba5..b896cc46b9d0801319e3dc0e4ac4743aea1bd7db 100644
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import base64
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 9bdacb5518d6a9889fa9718e2205bf27c7e99fd5..f58611c49b88ca796abecf3e1823072cfc721e1c 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import time
 from collections.abc import AsyncGenerator, Mapping
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index 5ef1a486d86c8febd479d4903a4813812d182e31..3db0a71fadd153f378fafb5867f364a81a555400 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Final, Optional, Union
 
@@ -110,7 +111,12 @@ class OpenAIServingTokenization(OpenAIServing):
                           dict) and "prompt_token_ids" in engine_prompt:
                 input_ids.extend(engine_prompt["prompt_token_ids"])
 
+        token_strs = None
+        if request.return_token_strs:
+            token_strs = tokenizer.convert_ids_to_tokens(input_ids)
+
         return TokenizeResponse(tokens=input_ids,
+                                token_strs=token_strs,
                                 count=len(input_ids),
                                 max_model_len=self.max_model_len)
 
diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py
index 13565d0ef8dd72c0169e18128cc5d881e6525946..f667c7e9b3a96b1a3d71ab5124708a03bdbcf226 100644
--- a/vllm/entrypoints/openai/serving_transcription.py
+++ b/vllm/entrypoints/openai/serving_transcription.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import io
 import time
@@ -278,7 +279,9 @@ class OpenAIServingTranscription(OpenAIServing):
 
         result_generator: Optional[AsyncGenerator[RequestOutput, None]] = None
         try:
-            # TODO(rob): subtract len of tokenized prompt.
+            # Unlike most decoder-only models, whisper generation length is not
+            # constrained by the size of the input audio, which is mapped to a
+            # fixed-size log-mel-spectogram.
             default_max_tokens = self.model_config.max_model_len
             sampling_params = request.to_sampling_params(
                 default_max_tokens, self.default_sampling_params)
diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py
index 054c0b006b2fccfe05b8035bbcfc76ede8cde52c..3e4f4e149c9f440aabbaa219aea48b74bd176e25 100644
--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .abstract_tool_parser import ToolParser, ToolParserManager
 from .deepseekv3_tool_parser import DeepSeekV3ToolParser
diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
index 931d5aab9bd9de9654bc37e8d3a2b0400c4a40a1..02aeab6136316b1a5888e3e617213c0c55d15df0 100644
--- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
index 14e743e13a7276d561e19d5ea4427487ba92609b..60025af2a6f334b0db7b8025c70b4649c27f30bf 100644
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from typing import Union
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
index 383e0d44de99fa916b169e806569a89172fb6898..5508ba6a3940841e5661fba34c9df065c105a0db 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
index b8bf142530ee3e2ab775195c98c206019887dc70..fcc5b7edda83f1ffdb58d63e7a63935abbbda987 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
index 2b9f9852bcb32fcedc7df8b301d416d538a92450..c7030d34d453eebe52a3f387aa8f7edc0816282e 100644
--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
index 3f2799f8010a563afdbc15e1bc9a7d77666234f0..e5dcdf9a07602754f90a321aa09b878af9b90174 100644
--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
index 2714a545f997f2a1f03a543e421413aa2b2ca458..66b483d8b0f66084bd9172365f00bcebc7eaf660 100644
--- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
index 858c8db99fd29b76ce67437acdb9cdb9a11e79bd..6bf44a4345a9d94cee7fc101156298432f805496 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import ast
 import json
 from collections.abc import Sequence
@@ -7,6 +8,7 @@ from typing import Any, Union
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
+import vllm.envs as envs
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               DeltaFunctionCall, DeltaMessage,
                                               DeltaToolCall,
@@ -64,7 +66,19 @@ class Llama4PythonicToolParser(ToolParser):
         if model_output.startswith("<|python_start|>"):
             model_output = model_output[len("<|python_start|>"):]
             model_output = model_output.replace("<|python_end|>", "")
-        if not (self.TOOL_CALL_REGEX.match(model_output)):
+
+        is_tool_call_pattern = False
+        try:
+            is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
+                model_output,
+                timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
+        except TimeoutError:
+            logger.warning(
+                "Regex timeout occurred when matching tool call pattern.")
+            logger.debug("Regex timeout occurred when matching user input: %s",
+                         model_output)
+
+        if not is_tool_call_pattern:
             return ExtractedToolCallInformation(tools_called=False,
                                                 tool_calls=[],
                                                 content=model_output)
diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
index 4eda7044cbbafd4bd1f87d128576eb605e0d327c..5698bc70af23bbd87ccaac494394b0fb911f1607 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index fecad7e653abc94f07549fe8822c2d6c7f4442a4..ab1cfd4b6eabeb226537c781f878e48b68f54ef6 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
@@ -43,11 +44,17 @@ class MistralToolCall(ToolCall):
         return id.isalnum() and len(id) == 9
 
 
+def _is_fn_name_regex_support(model_tokenizer: AnyTokenizer) -> bool:
+    return isinstance(model_tokenizer, MistralTokenizer) \
+        and model_tokenizer.version >= 11
+
+
 @ToolParserManager.register_module("mistral")
 class MistralToolParser(ToolParser):
     """
-    Tool call parser for Mistral 7B Instruct v0.3, intended for use with the
-    examples/tool_chat_template_mistral.jinja template.
+    Tool call parser for Mistral 7B Instruct v0.3, intended for use with
+    - [`mistral_common`](https://github.com/mistralai/mistral-common/)
+    - the examples/tool_chat_template_mistral.jinja template.
 
     Used when --enable-auto-tool-choice --tool-call-parser mistral are all set
     """
@@ -69,6 +76,12 @@ class MistralToolParser(ToolParser):
         self.bot_token = "[TOOL_CALLS]"
         self.bot_token_id = self.vocab.get(self.bot_token)
         self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
+        if _is_fn_name_regex_support(self.model_tokenizer):
+            self.fn_name_regex = re.compile(r'([a-zA-Z0-9_-]+)(\{.*?\})',
+                                            re.DOTALL)
+        else:
+            self.fn_name_regex = None
+
         if self.bot_token_id is None:
             raise RuntimeError(
                 "Mistral Tool Parser could not locate the tool call token in "
@@ -108,11 +121,25 @@ class MistralToolParser(ToolParser):
         tool_content = model_output.replace(self.bot_token, "").strip()
 
         try:
-
             # we first try to directly load the json as parsing very nested
             # jsons is difficult
             try:
-                function_call_arr = json.loads(tool_content)
+                if self.fn_name_regex:
+                    matches = self.fn_name_regex.findall(tool_content)
+
+                    function_call_arr = []
+                    for match in matches:
+                        fn_name = match[0]
+                        args = match[1]
+
+                        # fn_name is encoded outside serialized json dump
+                        # only arguments are serialized
+                        function_call_arr.append({
+                            "name": fn_name,
+                            "arguments": json.loads(args)
+                        })
+                else:
+                    function_call_arr = json.loads(tool_content)
             except json.JSONDecodeError:
                 # use a regex to find the part corresponding to the tool call.
                 # NOTE: This use case should not happen if the model is trained
diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
index b403a146716d55e3157ce51053511a75bc83fa70..5501028cf36b8a1d37d82ecc92c2314409103af4 100644
--- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from collections.abc import Sequence
@@ -68,8 +69,8 @@ class Phi4MiniJsonToolParser(ToolParser):
                              len(function_call_arr))
             except json.JSONDecodeError as e:
                 logger.error(
-                    "Failed to parse function calls from model output: %s. "
-                    "Error: %s", model_output, str(e))
+                    "Failed to parse function calls from model output. "
+                    "Error: %s", str(e))
 
             tool_calls: list[ToolCall] = [
                 ToolCall(
diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
index 548ff39d1ca4f43bca75afc13a9a32f7b8349f92..73329cdf701d6e589845997d47c76bd73496db13 100644
--- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import ast
 import json
@@ -8,6 +9,7 @@ from typing import Any, Union
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
+import vllm.envs as envs
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               DeltaFunctionCall, DeltaMessage,
                                               DeltaToolCall,
@@ -61,8 +63,18 @@ class PythonicToolParser(ToolParser):
         """
         Extract the tool calls from a complete model response.
         """
-
-        if not (self.TOOL_CALL_REGEX.match(model_output)):
+        is_tool_call_pattern = False
+        try:
+            is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
+                model_output,
+                timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
+        except TimeoutError:
+            logger.warning(
+                "Regex timeout occurred when matching tool call pattern.")
+            logger.debug("Regex timeout occurred when matching user input: %s",
+                         model_output)
+
+        if not is_tool_call_pattern:
             return ExtractedToolCallInformation(tools_called=False,
                                                 tool_calls=[],
                                                 content=model_output)
diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py
index acbff3258e465c46b56284bec83eb79ed96dd25d..aa41cd6dc53ed1692487f7aded2205e3e82a6623 100644
--- a/vllm/entrypoints/openai/tool_parsers/utils.py
+++ b/vllm/entrypoints/openai/tool_parsers/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from json import JSONDecodeError, JSONDecoder
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 80b6c07c603f944e8faa76128a20e76a5032b0a1..c4e044f3a28e9c34f2efa17af242a9e261191f5a 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Union
 
 from torch.nn import CosineSimilarity
diff --git a/vllm/entrypoints/ssl.py b/vllm/entrypoints/ssl.py
index dba916b8bf13ffc1562f3fbd40e7b7aab3825654..e3646a60a7cc1bf5a6cc0c72a48104e8dc4c6d9c 100644
--- a/vllm/entrypoints/ssl.py
+++ b/vllm/entrypoints/ssl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from ssl import SSLContext
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index cc651a172b406fbb27afbb04adf2ac26f4670494..16ba2b4531acf699fec48cccb50ede5c648f0b89 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import functools
@@ -13,8 +14,9 @@ from vllm.logger import init_logger
 
 logger = init_logger(__name__)
 
-VLLM_SERVE_PARSER_EPILOG = (
-    "Tip: Use `vllm serve --help=<keyword>` to explore arguments from help.\n"
+VLLM_SUBCMD_PARSER_EPILOG = (
+    "Tip: Use `vllm [serve|run-batch] --help=<keyword>` "
+    "to explore arguments from help.\n"
     "   - To view a argument group:     --help=ModelConfig\n"
     "   - To view a single argument:    --help=max-num-seqs\n"
     "   - To search by keyword:         --help=max\n"
@@ -26,6 +28,11 @@ async def listen_for_disconnect(request: Request) -> None:
     while True:
         message = await request.receive()
         if message["type"] == "http.disconnect":
+            if request.app.state.enable_server_load_tracking:
+                # on timeout/cancellation the BackgroundTask in load_aware_call
+                # cannot decrement the server load metrics.
+                # Must be decremented by with_cancellation instead.
+                request.app.state.server_load_metrics -= 1
             break
 
 
@@ -167,8 +174,15 @@ def _validate_truncation_size(
     return truncate_prompt_tokens
 
 
-def show_filtered_argument_or_group_from_help(parser):
+def show_filtered_argument_or_group_from_help(parser, subcommand_name):
     import sys
+
+    # Only handle --help=<keyword> for the current subcommand.
+    # Since subparser_init() runs for all subcommands during CLI setup,
+    # we skip processing if the subcommand name is not in sys.argv.
+    if subcommand_name not in sys.argv:
+        return
+
     for arg in sys.argv:
         if arg.startswith('--help='):
             search_keyword = arg.split('=', 1)[1]
diff --git a/vllm/env_override.py b/vllm/env_override.py
index 9c084aa0afeb22f2132805e4dc7c92db89e343b2..a127a5587af673d419006bdf5c9b305ba145b32c 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -1,19 +1,26 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import torch
 
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
 # set some common config/environment variables that should be set
 # for all processes created by vllm and all processes
 # that interact with vllm workers.
 # they are executed whenever `import vllm` is called.
 
-if not os.path.exists('/dev/nvidia-caps-imex-channels'):
-    # normally, we disable NCCL_CUMEM_ENABLE because it
-    # will cost 1~2 GiB GPU memory with cudagraph+allreduce,
-    # see https://github.com/NVIDIA/nccl/issues/1234
-    # for more details.
-    # However, NCCL requires NCCL_CUMEM_ENABLE to work with
+if 'NCCL_CUMEM_ENABLE' in os.environ:
+    logger.warning(
+        "NCCL_CUMEM_ENABLE is set to %s, skipping override. "
+        "This may increase memory overhead with cudagraph+allreduce: "
+        "https://github.com/NVIDIA/nccl/issues/1234",
+        os.environ['NCCL_CUMEM_ENABLE'])
+elif not os.path.exists('/dev/nvidia-caps-imex-channels'):
+    # NCCL requires NCCL_CUMEM_ENABLE to work with
     # multi-node NVLink, typically on GB200-NVL72 systems.
     # The ultimate way to detect multi-node NVLink is to use
     # NVML APIs, which are too expensive to call here.
diff --git a/vllm/envs.py b/vllm/envs.py
index 9c64a6ee962073673c08e14b225dc8991a786b2e..07bfbcf185bd024455a00fd9d45c497ca6bdf923 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
 import os
@@ -15,6 +16,7 @@ if TYPE_CHECKING:
     VLLM_NCCL_SO_PATH: Optional[str] = None
     LD_LIBRARY_PATH: Optional[str] = None
     VLLM_USE_TRITON_FLASH_ATTN: bool = False
+    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
     VLLM_FLASH_ATTN_VERSION: Optional[int] = None
     LOCAL_RANK: int = 0
     CUDA_VISIBLE_DEVICES: Optional[str] = None
@@ -42,6 +44,7 @@ if TYPE_CHECKING:
     VLLM_PP_LAYER_PARTITION: Optional[str] = None
     VLLM_CPU_KVCACHE_SPACE: int = 0
     VLLM_CPU_OMP_THREADS_BIND: str = ""
+    VLLM_CPU_NUM_OF_RESERVED_CPU: int = 0
     VLLM_CPU_MOE_PREPACK: bool = True
     VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
     VLLM_XLA_CHECK_RECOMPILATION: bool = False
@@ -50,6 +53,7 @@ if TYPE_CHECKING:
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
     VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
+    VLLM_XLA_USE_SPMD: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
     VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
@@ -68,6 +72,7 @@ if TYPE_CHECKING:
     VERBOSE: bool = False
     VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_RPC_TIMEOUT: int = 10000  # ms
+    VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
     VLLM_PLUGINS: Optional[list[str]] = None
     VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
     VLLM_TORCH_PROFILER_DIR: Optional[str] = None
@@ -107,6 +112,7 @@ if TYPE_CHECKING:
     VLLM_DP_SIZE: int = 1
     VLLM_DP_MASTER_IP: str = ""
     VLLM_DP_MASTER_PORT: int = 0
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
     VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
     VLLM_V0_USE_OUTLINES_CACHE: bool = False
     VLLM_TPU_BUCKET_PADDING_GAP: int = 0
@@ -118,6 +124,9 @@ if TYPE_CHECKING:
     VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
     VLLM_ALL2ALL_BACKEND: str = "naive"
     VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
+    VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
+    VLLM_SLEEP_WHEN_IDLE: bool = False
+    VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
 
 
 def get_default_cache_root():
@@ -142,10 +151,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
 
 def get_vllm_port() -> Optional[int]:
     """Get the port from VLLM_PORT environment variable.
-    
+
     Returns:
         The port number as an integer if VLLM_PORT is set, None otherwise.
-        
+
     Raises:
         ValueError: If VLLM_PORT is a URI, suggest k8s service discovery issue.
     """
@@ -158,17 +167,13 @@ def get_vllm_port() -> Optional[int]:
         return int(port)
     except ValueError as err:
         from urllib.parse import urlparse
-        try:
-            parsed = urlparse(port)
-            if parsed.scheme:
-                raise ValueError(
-                    f"VLLM_PORT '{port}' appears to be a URI. "
-                    "This may be caused by a Kubernetes service discovery issue"
-                    "check the warning in: https://docs.vllm.ai/en/stable/usage/env_vars.html"
-                )
-        except Exception:
-            pass
-
+        parsed = urlparse(port)
+        if parsed.scheme:
+            raise ValueError(
+                f"VLLM_PORT '{port}' appears to be a URI. "
+                "This may be caused by a Kubernetes service discovery issue,"
+                "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html"
+            ) from None
         raise ValueError(
             f"VLLM_PORT '{port}' must be a valid integer") from err
 
@@ -290,6 +295,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "False").lower() in
              ("true", "1")),
 
+    # Use separate prefill and decode kernels for V1 attention instead of
+    # the unified triton kernel.
+    "VLLM_V1_USE_PREFILL_DECODE_ATTENTION":
+    lambda:
+    (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in
+     ("true", "1")),
+
     # Force vllm to use a specific flash-attention version (2 or 3), only valid
     # when using the flash-attention backend.
     "VLLM_FLASH_ATTN_VERSION":
@@ -300,9 +312,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: bool(
         os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
 
-    # Internal flag to enable/disable Inductor standalone compile
-    "VLLM_TEST_STANDALONE_COMPILE":
-    lambda: os.environ.get("VLLM_TEST_STANDALONE_COMPILE", "0") != "0",
+    # Feature flag to enable/disable Inductor standalone compile.
+    # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
+    # enabled by default.
+    "VLLM_USE_STANDALONE_COMPILE":
+    lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "1") == "1",
 
     # local rank of the process in the distributed setting, used to determine
     # the GPU device id
@@ -323,8 +337,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # Whether to log responses from API Server for debugging
     "VLLM_DEBUG_LOG_API_SERVER_RESPONSE":
-    lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False").
-    lower() == "true",
+    lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"
+                           ).lower() == "true",
 
     # S3 access information, used for tensorizer to load model from S3
     "S3_ACCESS_KEY_ID":
@@ -409,7 +423,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
     # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
     "VLLM_CPU_OMP_THREADS_BIND":
-    lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"),
+    lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"),
+
+    # (CPU backend only) CPU cores not used by OMP threads .
+    # Those CPU cores will not be used by OMP threads of a rank.
+    "VLLM_CPU_NUM_OF_RESERVED_CPU":
+    lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")),
 
     # (CPU backend only) whether to use prepack for MoE layer. This will be
     # passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might
@@ -506,6 +525,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # If set, assert on XLA recompilation after each execution step.
     "VLLM_XLA_CHECK_RECOMPILATION":
     lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))),
+
+    # Enable SPMD mode for TPU backend.
+    "VLLM_XLA_USE_SPMD":
+    lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))),
     "VLLM_FUSED_MOE_CHUNK_SIZE":
     lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),
 
@@ -541,6 +564,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_RPC_TIMEOUT":
     lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
 
+    # Timeout in seconds for keeping HTTP connections alive in API server
+    "VLLM_HTTP_TIMEOUT_KEEP_ALIVE":
+    lambda: int(os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")),
+
     # a list of plugin names to load, separated by commas.
     # if this is not set, it means all plugins will be loaded
     # if this is set to an empty string, no plugins will be loaded
@@ -746,6 +773,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_DP_MASTER_PORT":
     lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
 
+    # Randomize inputs during dummy runs when using Data Parallel
+    "VLLM_RANDOMIZE_DP_DUMMY_INPUTS":
+    lambda: os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1",
+
     # Whether to use S3 path for model loading in CI via RunAI Streamer
     "VLLM_CI_USE_S3":
     lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
@@ -813,6 +844,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Available options:
     # - "naive": naive all2all implementation using all-reduce
     # - "pplx": use pplx kernels
+    # - "deepep_high_throughput", use deepep high-throughput kernels
+    # - "deepep_low_latency", use deepep low-latency kernels
     "VLLM_ALL2ALL_BACKEND":
     lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
 
@@ -822,6 +855,21 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # This is used to prevent the kernel from running out of memory.
     "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE":
     lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")),
+
+    # Regex timeout for use by the vLLM tool parsing plugins.
+    "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS":
+    lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")),
+
+    # Reduce CPU usage when vLLM is idle. Enabling this will incur small
+    # latency penalty when a request eventually comes.
+    "VLLM_SLEEP_WHEN_IDLE":
+    lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))),
+
+    # Control the max chunk bytes (in MB) for the rpc message queue.
+    # Object larger than this threshold will be broadcast to worker
+    # processes via zmq.
+    "VLLM_MQ_MAX_CHUNK_BYTES_MB":
+    lambda: int(os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")),
 }
 
 # --8<-- [end:env-vars-definition]
@@ -884,7 +932,7 @@ def compute_hash() -> str:
         "VLLM_USE_TRITON_AWQ",
         "VLLM_DP_RANK",
         "VLLM_DP_SIZE",
-        "VLLM_TEST_STANDALONE_COMPILE",
+        "VLLM_USE_STANDALONE_COMPILE",
     ]
     for key in environment_variables_to_hash:
         if key in environment_variables:
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 40ca1d29939af8d75ad205ccf94fc7b273683297..99e12201c96afffb2625d7584cfb41c59f7e940e 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import time
diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py
index d1f8c36fbbec7638f89c0e828cbc42e3a66bee02..4e8c6d79095f991e4fe89f12609bf986a0524edd 100644
--- a/vllm/executor/mp_distributed_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import os
diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py
index e680d53cbd10e216407f0850a84d2f0f688967d9..852c8f5cffa0c5408f1a5a5f7f984b28be47e368 100644
--- a/vllm/executor/msgspec_utils.py
+++ b/vllm/executor/msgspec_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from array import array
 from typing import Any, Type
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index 380b672c3605aadbd74b62eff326fbd2e5448d46..a6c172beff7bbcc90d3d223a72764783339cdc78 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import os
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index 8e67c7a41bb19f285362f1e7465d4b557d18c802..bdc2b1f4c27cd8a2bd97ec0ac75f4c961edd6637 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import json
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 7bc98a16f041d5ed06f6cb3724edbb1d7d9b6a72..c222f1609096c81a4d24715faf68c38440543f69 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import time
diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
index 1d3a6e443a80eaa9b5b616f2f4c069aed12b9d57..7ebeb4a22556f6d987093e9edd407966d0e22be0 100644
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 3c8083e3dd0dd02aaba755ec1785b81e56dfcc94..f3b0518a44e033a6dcdd60b8b5cfa3fb541c6c69 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from collections import defaultdict
@@ -10,7 +11,7 @@ import torch
 import torch.distributed as dist
 
 import vllm.envs as envs
-from vllm.config import VllmConfig
+from vllm.config import ParallelConfig, VllmConfig
 from vllm.logger import init_logger
 
 if TYPE_CHECKING:
@@ -30,6 +31,53 @@ class DPMetadata:
     max_tokens_across_dp_cpu: torch.Tensor
     cu_tokens_across_dp_cpu: torch.Tensor
 
+    @staticmethod
+    def num_tokens_across_dp(num_tokens: int, dp_size: int,
+                             dp_rank: int) -> torch.Tensor:
+        """
+        Gather the num_tokens across all DP ranks and return results in a
+        CPU tensor of size dp_size.
+        """
+        num_tokens_across_dp = [0] * dp_size
+        num_tokens_across_dp[dp_rank] = num_tokens
+        num_tokens_tensor = torch.tensor(num_tokens_across_dp,
+                                         device="cpu",
+                                         dtype=torch.int32)
+        from vllm.distributed.parallel_state import get_dp_group
+        dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
+        return num_tokens_tensor
+
+    @staticmethod
+    def make(
+            parallel_config: ParallelConfig,
+            attn_metadata: Any,
+            num_tokens: int,
+            num_tokens_across_dp: Optional[torch.Tensor] = None
+    ) -> "DPMetadata":
+
+        assert parallel_config.data_parallel_size > 1
+        dp_size = parallel_config.data_parallel_size
+        dp_rank = parallel_config.data_parallel_rank
+        if attn_metadata is not None and hasattr(attn_metadata,
+                                                 "num_prefill_tokens"):
+            # for v0 attention backends
+            batchsize = attn_metadata.num_prefill_tokens + \
+                attn_metadata.num_decode_tokens
+        else:
+            # for v1 attention backends or no attn_metadata
+            batchsize = num_tokens
+
+        # If num_tokens_across_dp is None, it will be computed by all_reduce
+        # Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
+        assert (num_tokens_across_dp is None
+                or num_tokens_across_dp[dp_rank] == batchsize)
+        if num_tokens_across_dp is None:
+            num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
+                batchsize, dp_size, dp_rank)
+        max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp)
+        cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_across_dp, dim=0)
+        return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu)
+
 
 @dataclass
 class ForwardContext:
@@ -63,7 +111,8 @@ def get_forward_context() -> ForwardContext:
 def set_forward_context(attn_metadata: Any,
                         vllm_config: VllmConfig,
                         virtual_engine: int = 0,
-                        num_tokens: int = 0):
+                        num_tokens: Optional[int] = None,
+                        num_tokens_across_dp: Optional[torch.Tensor] = None):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
     Here we can inject common logic for every model forward pass.
@@ -73,28 +122,11 @@ def set_forward_context(attn_metadata: Any,
     if need_to_track_batchsize:
         forward_start_time = time.perf_counter()
     dp_metadata: Optional[DPMetadata] = None
-    if vllm_config.parallel_config.data_parallel_size > 1:
-        dp_size = vllm_config.parallel_config.data_parallel_size
-        dp_rank = vllm_config.parallel_config.data_parallel_rank
-        if attn_metadata is not None and hasattr(attn_metadata,
-                                                 "num_prefill_tokens"):
-            # for v0 attention backends
-            batchsize = attn_metadata.num_prefill_tokens + \
-                attn_metadata.num_decode_tokens
-        else:
-            # for v1 attention backends or no attn_metadata
-            batchsize = num_tokens
-        num_tokens_across_dp = [0] * dp_size
-        num_tokens_across_dp[dp_rank] = batchsize
-        num_tokens_tensor = torch.tensor(num_tokens_across_dp,
-                                         device="cpu",
-                                         dtype=torch.int32)
-        from vllm.distributed.parallel_state import get_dp_group
-        dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
-        max_tokens_across_dp_cpu = torch.max(num_tokens_tensor)
-        cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0)
-        dp_metadata = DPMetadata(max_tokens_across_dp_cpu,
-                                 cu_tokens_across_dp_cpu)
+    if vllm_config.parallel_config.data_parallel_size > 1 and (
+            attn_metadata is not None or num_tokens is not None):
+        dp_metadata = DPMetadata.make(vllm_config.parallel_config,
+                                      attn_metadata, num_tokens or 0,
+                                      num_tokens_across_dp)
 
     global _forward_context
     prev_context = _forward_context
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index df4f844cd815e4e1135c653fceab35613be488e7..37bf2b7a44366c244a63c7f6735e367a761eff3f 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .data import (DecoderOnlyInputs, EmbedsInputs, EncoderDecoderInputs,
                    ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType,
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 843c45bd6163e5bbb1b01b72e17ab5ae20b64305..23cb5e5022f194eac4933d28d9ccc0ff51435eae 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any, Generic, Literal, Optional, Union, cast
 
diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py
index 4c64a41ace310c4690968ed02944d55239f8db65..8c3700799e4ab9160057bfee164acff4b0b57e41 100644
--- a/vllm/inputs/parse.py
+++ b/vllm/inputs/parse.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Sequence
 from typing import Literal, Optional, TypedDict, Union, cast, overload
 
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index b9acabeabd8df46bd59cf932864040d163b24e9d..a13e563f34a14e6aa9110483e7893d1f15eda876 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from collections.abc import Mapping
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index f424a8f613ab1234e331638a4831d6b15c5c7991..3dad021e316687d043e0f84bd2fdebf530f9d209 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -1,11 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Mapping
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
 
+import torch
 from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
 from typing_extensions import TypeVar
 
+from vllm.jsontree import JSONTree, json_map_leaves
+from vllm.logger import init_logger
 from vllm.transformers_utils.processor import cached_processor_from_config
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import resolve_mm_processor_kwargs
@@ -20,6 +24,8 @@ _T = TypeVar("_T")
 _C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
 _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
 
+logger = init_logger(__name__)
+
 
 @dataclass(frozen=True)
 class InputContext:
@@ -133,7 +139,7 @@ class InputProcessingContext(InputContext):
         hf_processor: ProcessorMixin,
         data: Mapping[str, object],
         kwargs: Mapping[str, object] = {},
-    ) -> BatchFeature:
+    ) -> Union[BatchFeature, JSONTree]:
         """
         Call `hf_processor` on the prompt `data`
         (text, image, audio...) with configurable options `kwargs`.
@@ -153,8 +159,25 @@ class InputProcessingContext(InputContext):
             allow_var_kwargs=True,
         )
 
+        def maybe_cast_dtype(x):
+            # This mimics the behavior of transformers.BatchFeature
+            if isinstance(x, torch.Tensor) and x.is_floating_point():
+                return x.to(dtype=self.model_config.dtype)
+            return x
+
         try:
-            return hf_processor(**data, **merged_kwargs, return_tensors="pt")
+            output = hf_processor(**data, **merged_kwargs, return_tensors="pt")
+            # this emulates output.to(dtype=self.model_config.dtype)
+            cast_output = json_map_leaves(maybe_cast_dtype, output)
+            if isinstance(output, BatchFeature):
+                return BatchFeature(cast_output)
+
+            logger.warning_once(
+                f"{type(hf_processor).__name__} did not return `BatchFeature`. "
+                "Make sure to match the behaviour of `ProcessorMixin` when "
+                "implementing custom processors.")
+            return cast_output
+
         except Exception as exc:
             msg = (f"Failed to apply {type(hf_processor).__name__} "
                    f"on data={data} with kwargs={merged_kwargs}")
diff --git a/vllm/jsontree.py b/vllm/jsontree.py
index 91cd7cb216d7755a11cf5dcf9aab173f8c3dbfd5..4cbe0f76e0067797a2d72e2a5adb9d835f35b8bf 100644
--- a/vllm/jsontree.py
+++ b/vllm/jsontree.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Helper functions to work with nested JSON structures."""
 from collections.abc import Iterable
 from functools import reduce
diff --git a/vllm/logger.py b/vllm/logger.py
index fd16dd95bb1b302ef671bba3e017924b5ce0aedf..0ddb83cb8ba7a4bdecc875a2733312b870043ada 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Logging configuration for vLLM."""
 import datetime
 import json
diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py
index 7ab4632589bf499cf967a02794e0381413fb946e..cf690a89ae9bcf1ae263020ed867323cfcd424ba 100644
--- a/vllm/logging_utils/__init__.py
+++ b/vllm/logging_utils/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.logging_utils.formatter import NewLineFormatter
 
diff --git a/vllm/logging_utils/dump_input.py b/vllm/logging_utils/dump_input.py
index 47ce0ab188bd64f5cf79705a1ac6fc9faa462c63..d14515f56e54c70840edf4865d944674c7a1b3c2 100644
--- a/vllm/logging_utils/dump_input.py
+++ b/vllm/logging_utils/dump_input.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
 import enum
diff --git a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py
index 010b0a124987b5ff75820d3ff5067ea3803cf3cc..0affef10078dcb6c9ef74926ca9a72f5cfe8ba60 100644
--- a/vllm/logging_utils/formatter.py
+++ b/vllm/logging_utils/formatter.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
 
diff --git a/vllm/logits_process.py b/vllm/logits_process.py
index 29a73656bf65e930cf815e98c3b2e63657e58bc4..5967d0836bd45da506cff20581fe99960242e40a 100644
--- a/vllm/logits_process.py
+++ b/vllm/logits_process.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Union
 
diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py
index b6b138a44051f506e70ed7da12ea9795a11d98b2..7fc4cfe026aee112a37248568ff058c8d3ac1193 100644
--- a/vllm/lora/fully_sharded_layers.py
+++ b/vllm/lora/fully_sharded_layers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # pylint: disable=unused-argument
 from typing import TYPE_CHECKING, Optional, Union, cast
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 023c8e9c9a8646ba1e0c21ac1c50e3347bbb8213..66e037a97d063552aa61680ff87dc37309054840 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # pylint: disable=unused-argument
 import math
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py
index 294b49e0a8997f209a9e399d4457added21403e7..958364fca592f7d55a5f4168f3284a3945f91c46 100644
--- a/vllm/lora/lora.py
+++ b/vllm/lora/lora.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence as GenericSequence
 from typing import Optional
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index af5cebdf2a8bbd4c4c4e23fd460f4bd971e410ce..262e6799583ae672d28bb9ee7f46b58330eedbf5 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import copy
 import math
 import os
 from collections.abc import Sequence
@@ -34,6 +34,7 @@ from vllm.model_executor.models import SupportsLoRA, supports_multimodal
 from vllm.model_executor.models.interfaces import is_pooling_model
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
+from vllm.model_executor.utils import get_packed_modules_mapping
 from vllm.utils import is_pin_memory_available
 
 logger = init_logger(__name__)
@@ -200,7 +201,7 @@ class LoRAModel(AdapterModel):
             weights_mapper: Optional[WeightsMapper] = None,
             tensorizer_config_dict: Optional[dict] = None) -> "LoRAModel":
         """Create a LoRAModel from a local checkpoint.
-        
+
         Args:
             lora_dir: The local path that has lora data.
             expected_lora_modules: Name of modules that are expected to be
@@ -364,8 +365,8 @@ class LoRAModelManager(AdapterModelManager):
             # We need to replace rotary emb layer to do batch computation
             # for long lora.
             self.supported_lora_modules.append("rotary_emb")
-        self.packed_modules_mapping = copy.deepcopy(
-            self.model.packed_modules_mapping)
+
+        self.packed_modules_mapping = get_packed_modules_mapping(self.model)
         # Used to indicate whether the model is a multimodal model
         self.supports_mm: bool = (
             supports_multimodal(self.model)
@@ -620,7 +621,7 @@ class LoRAModelManager(AdapterModelManager):
     def _filter_unsupported_mm_module(self, module_name: str) -> bool:
         """
         Regarding multimodal models, vLLM currently only supports adding LoRA to
-        language model. LoRA for other modules, such as the vision tower, will 
+        language model. LoRA for other modules, such as the vision tower, will
         be filtered out.
         """
         if self.supports_mm:
diff --git a/vllm/lora/ops/torch_ops/__init__.py b/vllm/lora/ops/torch_ops/__init__.py
index 85601d58c9d73d4680806d13d0414d238454da10..22aa3c63dce193522a73d63de8857916fad86d41 100644
--- a/vllm/lora/ops/torch_ops/__init__.py
+++ b/vllm/lora/ops/torch_ops/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand  # noqa: F401
 from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink,
diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py
index af79f98415cbc1d83bd0e3446f4596562f776315..cba5baad8668613df85be14af0b26e39d834dc08 100644
--- a/vllm/lora/ops/torch_ops/lora_ops.py
+++ b/vllm/lora/ops/torch_ops/lora_ops.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
@@ -36,10 +37,13 @@ def bgmv_expand(inputs: torch.Tensor,
     if outputs.shape[0] == 1 and output_tensor.shape[0] != 1:
         limit = 1
 
+    # LoRA adapter and model may add different amounts of padding to output
+    common_len = min(outputs.shape[1], output_tensor.shape[1])
+
     if add_inputs:
-        output_tensor[:, :outputs.shape[1]] += outputs[:limit, :]
+        output_tensor[:, :common_len] += outputs[:limit, :common_len]
     else:
-        output_tensor[:, :outputs.shape[1]] = outputs[:limit, :]
+        output_tensor[:, :common_len] = outputs[:limit, :common_len]
 
 
 def sgmv_shrink(
diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py
index 5a39705e857128966d341f4829140c2260342afe..805de4b6f657034a9029d5c052e27023b387b00f 100644
--- a/vllm/lora/ops/triton_ops/__init__.py
+++ b/vllm/lora/ops/triton_ops/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand
 from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta
diff --git a/vllm/lora/ops/triton_ops/kernel_utils.py b/vllm/lora/ops/triton_ops/kernel_utils.py
index 0f971c03592d11a51bd093c084b5bb5608d113ad..e93064d0c83ad646403f60859a6de3e61cf19cb3 100644
--- a/vllm/lora/ops/triton_ops/kernel_utils.py
+++ b/vllm/lora/ops/triton_ops/kernel_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Utilities for Punica kernel construction.
 """
diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py
index 9feb9e46245910db3a519d74858e0bc72c91b130..9e1f90e757cde6cce01c5a28e066d82760b58746 100644
--- a/vllm/lora/ops/triton_ops/lora_expand_op.py
+++ b/vllm/lora/ops/triton_ops/lora_expand_op.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
diff --git a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py
index ac459a83220c71143f4ded0eb8a91c3ef888200b..39e647b9b88a4e7bca60752adf3cac83308ed3f0 100644
--- a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py
+++ b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LoRA kernels metadata preparation utilities.
 """
diff --git a/vllm/lora/ops/triton_ops/lora_shrink_op.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py
index c3871bd58ffa18cf6e0afd988a3923a8acfb5971..3f9edfc6d655cbafa0f8145fe0ac1a9545f11d0b 100644
--- a/vllm/lora/ops/triton_ops/lora_shrink_op.py
+++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py
index 6225635c2955fb7c2ac2f360da4e429808ab3b31..5857f7fecb5b4a2647c8b9a56dd12c66ced31746 100644
--- a/vllm/lora/ops/triton_ops/utils.py
+++ b/vllm/lora/ops/triton_ops/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/lora/ops/xla_ops/__init__.py b/vllm/lora/ops/xla_ops/__init__.py
index 94062b05d9161ec3416d418c000cd3cc947f7bc6..7e7c3c892457a9811e1dd1f3bd04e2e9c19f8682 100644
--- a/vllm/lora/ops/xla_ops/__init__.py
+++ b/vllm/lora/ops/xla_ops/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.lora.ops.xla_ops.lora_ops import (bgmv_expand, bgmv_expand_slice,
                                             bgmv_shrink)
diff --git a/vllm/lora/ops/xla_ops/lora_ops.py b/vllm/lora/ops/xla_ops/lora_ops.py
index acbec0cfab9c75d6464b892bad231108e69d5e61..9118f3351ef0a55f0adcf153355f609be96d2404 100644
--- a/vllm/lora/ops/xla_ops/lora_ops.py
+++ b/vllm/lora/ops/xla_ops/lora_ops.py
@@ -1,63 +1,100 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import jax
+import jax.numpy as jnp
 import torch
+import torch.nn.functional as F
+import torch_xla.core.xla_builder as xb
+from torch.library import impl
+from torch_xla.experimental.custom_kernel import XLA_LIB, jax_import_guard
 
-# Required to register the custom ops
-import vllm.lora.ops.xla_ops.pallas  # noqa # pylint: disable=unused-import
 
+@jax.jit
+def bgmv_jax(inputs, loras, idxs):
+    return jnp.einsum(
+        "td,tX,Xld->tl",
+        inputs,
+        jax.nn.one_hot(idxs, loras.shape[0], dtype=inputs.dtype),
+        loras,
+    )
 
-def bgmv_expand(inputs: torch.Tensor,
-                lora_b_weights: torch.Tensor,
-                output_tensor: torch.Tensor,
-                lora_indices_tensor: torch.Tensor,
-                add_inputs: bool = True):
+
+XLA_LIB.define("bgmv(Tensor inputs, Tensor loras, Tensor idxs) -> Tensor")
+
+
+@impl(XLA_LIB, "bgmv", "XLA")
+def bgmv_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor):
+    if len(loras.shape) == 4:
+        loras = loras.squeeze(axis=1)
+
+    jax_import_guard()
+    return xb.call_jax(bgmv_jax, (inputs, loras, idxs))
+
+
+@impl(XLA_LIB, "bgmv", "CompositeExplicitAutograd")
+def bgmv_non_xla(inputs: torch.Tensor, loras: torch.Tensor,
+                 idxs: torch.IntTensor):
+    T, _ = inputs.shape
+    if len(loras.shape) == 4:
+        loras = loras.squeeze(axis=1)
+    _, L, _ = loras.shape
+
+    return torch.empty((T, L), device=inputs.device)
+
+
+def bgmv_expand(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    add_inputs: bool = True,
+):
     """
     Args:
         inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size].
-        
-        lora_b_weights (torch.Tensor): LoRA weights of shape 
+
+        lora_b_weights (torch.Tensor): LoRA weights of shape
             [num_loras, lora_rank, hidden_size].
-        
-        output_tensor (torch.Tensor): output tensor of shape 
+
+        output_tensor (torch.Tensor): output tensor of shape
             [num_tokens, hidden_size * num_slices].
-        
-        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] 
+
+        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens]
             indicating which LoRA matrix to use for each token.
-        add_inputs (bool): Whether or not to add the input tensor to the output 
+        add_inputs (bool): Whether or not to add the input tensor to the output
             tensor.
     """
 
     outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor)
-    n_tokens = outputs.size(0)
 
     limit = output_tensor.shape[0]
     if outputs.shape[0] == 1 and output_tensor.shape[0] != 1:
         limit = 1
 
-    outputs = torch.cat(
-        (outputs,
-         torch.zeros((n_tokens, output_tensor.shape[1] - outputs.shape[1]),
-                     device=outputs.device)),
-        dim=1)
+    if output_tensor.shape[1] > outputs.shape[1]:
+        outputs = F.pad(outputs,
+                        (0, output_tensor.shape[1] - outputs.shape[1], 0, 0))
 
     if add_inputs:
-        return output_tensor + outputs[:limit, :]
+        return output_tensor + outputs[:limit, :output_tensor.shape[1]]
     else:
-        return outputs[:limit, :]
+        return outputs[:limit, :output_tensor.shape[1]]
 
 
-def bgmv_shrink(inputs: torch.Tensor,
-                lora_b_weights: torch.Tensor,
-                output_tensor: torch.Tensor,
-                lora_indices_tensor: torch.Tensor,
-                scaling: float = 1.0):
+def bgmv_shrink(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    scaling: float = 1.0,
+):
     """
     Args:
         inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size].
-        lora_b_weights (torch.Tensor): LoRA weights of shape 
+        lora_b_weights (torch.Tensor): LoRA weights of shape
             [num_loras, lora_rank, hidden_size].
         output_tensor (torch.Tensor): (Unused) output tensor (placeholder).
-        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] 
+        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens]
             indicating which LoRA matrix to use for each token.
         scaling (float, optional): Scalar multiplier applied to the output.
     """
@@ -66,39 +103,41 @@ def bgmv_shrink(inputs: torch.Tensor,
                                         lora_indices_tensor)
 
 
-def bgmv_expand_slice(inputs: torch.Tensor,
-                      lora_b_weights: torch.Tensor,
-                      output_tensor: torch.Tensor,
-                      lora_indices_tensor: torch.Tensor,
-                      slice_offset: int,
-                      slice_size: int,
-                      add_inputs: bool = True):
+def bgmv_expand_slice(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    slice_offset: int,
+    slice_size: int,
+    add_inputs: bool = True,
+):
     """
     Args:
         inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size].
-        
-        lora_b_weights (torch.Tensor): LoRA weights of shape 
+
+        lora_b_weights (torch.Tensor): LoRA weights of shape
             [num_loras, lora_rank, hidden_size].
-        
-        output_tensor (torch.Tensor): output tensor of shape 
+
+        output_tensor (torch.Tensor): output tensor of shape
             [num_tokens, hidden_size * num_slices].
-        
-        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] 
+
+        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens]
             indicating which LoRA matrix to use for each token.
-        add_inputs (bool): Whether or not to add the input tensor to the output 
+        add_inputs (bool): Whether or not to add the input tensor to the output
             tensor.
     """
     outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor)
-    n_tokens = outputs.size(0)
 
-    outputs = torch.cat((
-        torch.zeros((n_tokens, slice_offset), device=outputs.device),
+    outputs = F.pad(
         outputs,
-        torch.zeros(
-            (n_tokens, output_tensor.shape[1] - (slice_offset + slice_size)),
-            device=outputs.device),
-    ),
-                        dim=1)
+        (
+            slice_offset,
+            output_tensor.shape[1] - (slice_offset + slice_size),
+            0,
+            0,
+        ),
+    )
 
     if add_inputs:
         return output_tensor + outputs
diff --git a/vllm/lora/ops/xla_ops/pallas.py b/vllm/lora/ops/xla_ops/pallas.py
deleted file mode 100644
index 35dc307539bf45b0bdbffd2a69eca183798c074a..0000000000000000000000000000000000000000
--- a/vllm/lora/ops/xla_ops/pallas.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-import functools
-
-import jax
-import jax.numpy as jnp
-import torch
-from jax.experimental import pallas as pl
-from jax.experimental.pallas import tpu as pltpu
-from torch.library import impl
-from torch_xla.experimental.custom_kernel import (XLA_LIB, jax_import_guard,
-                                                  make_kernel_from_pallas)
-
-# TODO: Tune these
-TOKENS_BLOCK = 16
-LORA_RANK_BLOCK = 128
-DIM_BLOCK_SIZE = 128
-
-
-def _bgmv_kernel(bT: int, bL: int, idx_ref, inp_ref, lora_ref, out_ref,
-                 acc_ref, mask_ref):
-
-    @pl.when(pl.program_id(2) == 0)
-    def _():
-        acc_ref[...] = jnp.zeros_like(acc_ref[...], dtype=jnp.float32)
-
-    t = pl.program_id(0)
-
-    for i in range(bT):
-        idx = idx_ref[i + bT * t]
-        mask_ref[...] = jnp.zeros_like(mask_ref[...], dtype=jnp.float32)
-        mask_ref[i, :] = jnp.ones((bL, ), dtype=jnp.float32)
-
-        acc_ref[...] += jax.lax.dot_general(
-            inp_ref[...],
-            lora_ref[idx, ...], (((1, ), (1, )), ((), ())),
-            preferred_element_type=jnp.float32) * mask_ref[...]
-
-    @pl.when(pl.program_id(2) == pl.num_programs(2) - 1)
-    def _():
-        out_ref[...] = acc_ref[...].astype(out_ref.dtype)
-
-
-@jax.jit
-def _bgmv(
-    idxs: jax.Array,  # (T, ) int32
-    inputs: jax.Array,  # (T, D) model dtype
-    loras: jax.Array  # (N, L, D) model dtype
-) -> jax.Array:  # (T, L) model dtype
-    T, D = inputs.shape
-    N, L, _ = loras.shape
-
-    return pl.pallas_call(
-        kernel=functools.partial(_bgmv_kernel, TOKENS_BLOCK, LORA_RANK_BLOCK),
-        out_shape=jax.ShapeDtypeStruct((T, L), dtype=inputs.dtype),
-        grid_spec=pltpu.PrefetchScalarGridSpec(
-            num_scalar_prefetch=1,
-            grid=(T // TOKENS_BLOCK, L // LORA_RANK_BLOCK,
-                  D // DIM_BLOCK_SIZE),
-            in_specs=[
-                pl.BlockSpec((TOKENS_BLOCK, DIM_BLOCK_SIZE),
-                             lambda i, j, k, block_idx: (i, k)),
-                pl.BlockSpec((N, LORA_RANK_BLOCK, DIM_BLOCK_SIZE),
-                             lambda i, j, k, block_idx: (0, j, k)),
-            ],
-            out_specs=pl.BlockSpec((TOKENS_BLOCK, LORA_RANK_BLOCK),
-                                   lambda i, j, k, block_idx: (i, j)),
-            scratch_shapes=[
-                pltpu.VMEM((TOKENS_BLOCK, LORA_RANK_BLOCK), jnp.float32),
-                pltpu.VMEM((TOKENS_BLOCK, LORA_RANK_BLOCK), jnp.float32)
-            ]),
-        compiler_params=pltpu.TPUCompilerParams(
-            dimension_semantics=("parallel", "parallel", "arbitrary")),
-        name="bgmv")(idxs, inputs, loras)
-
-
-def bgmv_shape_function(idxs, inputs, loras):
-    T, _ = inputs.shape
-    _, L, _ = loras.shape
-
-    return [((T, L), inputs.dtype)]
-
-
-XLA_LIB.define("bgmv(Tensor inputs, Tensor loras, Tensor idxs) -> Tensor", )
-
-
-@impl(XLA_LIB, "bgmv", "XLA")
-def bgmv_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor):
-    inputs = inputs.to(dtype=loras.dtype)
-
-    if len(loras.shape) == 4:
-        loras = loras.squeeze(axis=1)
-
-    jax_import_guard()
-    kernel = make_kernel_from_pallas(_bgmv, bgmv_shape_function)
-
-    T, _ = inputs.shape
-    _, L, D = loras.shape
-
-    # Pad the loras' rank if it's too low. This is to allow it to fit in a TPU
-    # register. This has to happen in pytorch, doing it in Jax will lead to NaNs
-    L1 = L
-    if LORA_RANK_BLOCK > L or L % LORA_RANK_BLOCK != 0:
-        L1 = (L // LORA_RANK_BLOCK + 1) * LORA_RANK_BLOCK
-
-    D1 = D
-    if DIM_BLOCK_SIZE > D or D % DIM_BLOCK_SIZE != 0:
-        D1 = (D // DIM_BLOCK_SIZE + 1) * DIM_BLOCK_SIZE
-
-    T1 = T
-    if TOKENS_BLOCK > T or T % TOKENS_BLOCK != 0:
-        T1 = (T // TOKENS_BLOCK + 1) * TOKENS_BLOCK
-
-    if D1 != D or L1 != L:
-        loras = torch.nn.functional.pad(loras, (0, D1 - D, 0, L1 - L, 0, 0))
-    if D1 != D or T1 != T:
-        inputs = torch.nn.functional.pad(inputs, (0, D1 - D, 0, T1 - T))
-        if T1 != T:
-            idxs = torch.nn.functional.pad(idxs, ((0, T1 - T)))
-
-    return kernel(idxs, inputs, loras)[:T, :L]
-
-
-@impl(XLA_LIB, "bgmv", "CompositeExplicitAutograd")
-def bgmv_non_xla(inputs: torch.Tensor, loras: torch.Tensor,
-                 idxs: torch.IntTensor):
-    T, _ = inputs.shape
-
-    if len(loras.shape) == 4:
-        loras = loras.squeeze(axis=1)
-
-    _, L, _ = loras.shape
-
-    return torch.empty((T, L), device=inputs.device)
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index 7d335e5f7fab19579b44c7e4b3dd3eed5db552fc..a20d73f0f725bbd0ff4a2fb3c9b2df02d80dd5ee 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py
 
diff --git a/vllm/lora/punica_wrapper/__init__.py b/vllm/lora/punica_wrapper/__init__.py
index 915fc6623398e2aa2ff67723aa3770d35b4aa1db..e664ffa1dfe6ee60962dbe496cccf123b164eb05 100644
--- a/vllm/lora/punica_wrapper/__init__.py
+++ b/vllm/lora/punica_wrapper/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
 from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper
diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py
index e03f7329021b31069bf479358e1d4b5abff15c26..5b4902dcbeb359c30da0aa06c4e0954eae87d71c 100644
--- a/vllm/lora/punica_wrapper/punica_base.py
+++ b/vllm/lora/punica_wrapper/punica_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py
index 8118a72d696a20217de29895f109bba22a2f43f7..59049cccc8cbeb46bf1fa272bf1ce9bd0471d611 100644
--- a/vllm/lora/punica_wrapper/punica_cpu.py
+++ b/vllm/lora/punica_wrapper/punica_cpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional, Union
 
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index 224640ec719254f6d00950b0e4c847359a586def..6b038309d55db4677fa28b8b8aa4918dacccedd6 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Based on:
 Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py
index 416c23e73bf8535782bf3ef838b06dbc19b3cb16..b20c9785a74c1ee474cc57104634832bb2804074 100644
--- a/vllm/lora/punica_wrapper/punica_hpu.py
+++ b/vllm/lora/punica_wrapper/punica_hpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING, Optional, Union, final
 
diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py
index 922d6c06000379ebd4937c7776e95c890ff80573..c684ac77cc9ca30ee7b4eab93d9abd19a74b6a51 100644
--- a/vllm/lora/punica_wrapper/punica_selector.py
+++ b/vllm/lora/punica_wrapper/punica_selector.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py
index f3153c6dab03c945fe086ff529965dd58efa48b6..6b48268c5006e6d9c1bb79b1ef600e9fadc482e7 100644
--- a/vllm/lora/punica_wrapper/punica_tpu.py
+++ b/vllm/lora/punica_wrapper/punica_tpu.py
@@ -1,11 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
+import math
+from typing import TYPE_CHECKING, Optional, Union
 
 import torch
 import torch.nn.functional as F
+import torch_xla.core.xla_model as xm
 
 from vllm.lora.ops.xla_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink
+from vllm.lora.punica_wrapper.utils import convert_mapping
+
+if TYPE_CHECKING:
+    # avoid circuit import
+    from vllm.lora.layers import LoRAMapping
+    from vllm.lora.models import LongContextLoRAContext
 
 from .punica_base import PunicaWrapperBase
 
@@ -31,6 +40,15 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         self._sampler_indices_padded = self._sampler_indices_padded.to(
             dtype=torch.int32)
 
+        torch.ops.xla.dynamo_set_buffer_donor_(self._token_lora_indices, True)
+        torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices, True)
+        torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices_padded,
+                                               True)
+        torch.ops.xla.dynamo_set_buffer_donor_(self._embeddings_indices, True)
+        torch.ops.xla.dynamo_set_buffer_donor_(self._long_lora_indices, True)
+        torch.ops.xla.dynamo_set_buffer_donor_(self._lora_indices_per_batch,
+                                               True)
+
         torch._dynamo.mark_dynamic(self._token_lora_indices, 0)
         torch._dynamo.mark_dynamic(self._embeddings_indices, 1)
         torch._dynamo.mark_dynamic(self._sampler_indices_padded, 0)
@@ -55,15 +73,11 @@ class PunicaWrapperTPU(PunicaWrapperBase):
 
     def shrink(
         self,
-        y: torch.Tensor,
         x: torch.Tensor,
         w_t_all: torch.Tensor,
         scale: float,
     ):
-        if self.no_lora:
-            return y
-        return bgmv_shrink(x, w_t_all, y, self._get_token_lora_indices(x),
-                           scale)
+        return bgmv_shrink(x, w_t_all, self._get_token_lora_indices(x), scale)
 
     def expand(self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor,
                add_inputs: bool):
@@ -72,7 +86,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
 
     def expand_slice(self, y: torch.Tensor, x: torch.Tensor,
                      w_t_all: torch.Tensor, y_offset: int, y_slice_size: int,
-                     y_total_size: int, add_inputs: bool) -> torch.Tensor:
+                     add_inputs: bool) -> torch.Tensor:
         return bgmv_expand_slice(x, w_t_all, y,
                                  self._get_token_lora_indices(x), y_offset,
                                  y_slice_size, add_inputs)
@@ -98,9 +112,8 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         x = x.view(-1, x.shape[-1])
 
         for slice_idx in range(len(lora_a_stacked)):
-            y_s = y[slice_idx]
             lora_s = lora_a_stacked[slice_idx]
-            y_s = self.shrink(y_s, x, lora_s, scale)
+            y_s = self.shrink(x, lora_s, scale)
             y[slice_idx, :, :] = y_s  # type: ignore[index]
         return y
 
@@ -140,15 +153,12 @@ class PunicaWrapperTPU(PunicaWrapperBase):
             y = self._apply_bias(self._get_token_lora_indices(y), y,
                                  output_slices, lora_bias_stacked)
         for slice_idx in range(len(lora_b_stacked)):
-            y = self.expand_slice(
-                y,
-                x[slice_idx],
-                lora_b_stacked[slice_idx],
-                offset_left,
-                output_slices[slice_idx],
-                y_total_size=sum(output_slices),
-                add_inputs=add_inputs,
-            )
+            y = self.expand_slice(y,
+                                  x[slice_idx],
+                                  lora_b_stacked[slice_idx],
+                                  offset_left,
+                                  output_slices[slice_idx],
+                                  add_inputs=add_inputs)
             offset_left += output_slices[slice_idx]
         return y.view_as(y_org)
 
@@ -216,12 +226,10 @@ class PunicaWrapperTPU(PunicaWrapperBase):
 
         if buffer is None:
             r = lora_b_stacked[0].size(-1)
-            # We set the buffer to be float32 by default, consistent with the
-            # triton op
             T = x.size(0)
             buffer = torch.zeros(
                 (len(output_slices), T, r),
-                dtype=torch.float32,
+                dtype=x.dtype,
                 device=x.device,
             )
         buffer = self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
@@ -257,26 +265,16 @@ class PunicaWrapperTPU(PunicaWrapperBase):
             scale (float): Scaling factor.
             buffer (Optional[torch.Tensor]):Default to None.
         """
-        if self.no_lora:
-            return y
-
         y_org = y
         y = y.view(-1, y.shape[-1])
         x = x.view(-1, x.shape[-1])
-        r = lora_b_stacked.size(-1)
-        if buffer is None:
-            # We set the buffer to be float32 by default, consistent with the
-            # triton op
-            buffer = torch.zeros((x.size(0), r),
-                                 dtype=torch.float32,
-                                 device=x.device)
-
-        buffer = bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices,
-                             scale)
+
+        sampler_indices = torch.narrow(self._sampler_indices, 0, 0, x.size(0))
+        buffer = bgmv_shrink(x, lora_a_stacked, sampler_indices, scale)
         y = bgmv_expand(buffer,
                         lora_b_stacked,
                         y,
-                        self.sampler_indices,
+                        sampler_indices,
                         add_inputs=True)
         return y.view_as(y_org)
 
@@ -316,10 +314,92 @@ class PunicaWrapperTPU(PunicaWrapperBase):
 
         return output.view_as(org_output)
 
+    # This performs the same tensor ops as the base method, except it does them
+    # on the CPU then transfers the results to the TPU
+    def _update_base_metadata(
+        self,
+        mapping: "LoRAMapping",
+        lora_index_to_id: list[Optional[int]],
+        max_loras: int,
+        vocab_size: int,
+        extra_vocab_size: int,
+        long_lora_context: Optional["LongContextLoRAContext"] = None,
+    ):
+        # Make sure we don't accidentally collect outside operations
+        xm.mark_step()
+
+        # Pad the prompt mapping to avoid running into recompiles on the TPU
+        # TODO: Should this happen inside mapping internally? If so how can we
+        # avoid having backend specific LoRAMapping classes?
+        mapping.prompt_mapping = self._pad_prompt_mapping(
+            mapping.prompt_mapping)
+
+        (
+            base_indices,
+            sampler_indices,
+            sampler_indices_padded,
+            embeddings_indices,
+            long_lora_offsets_tensor,
+            indices_len,
+        ) = convert_mapping(
+            mapping,
+            lora_index_to_id,
+            max_loras,
+            vocab_size,
+            extra_vocab_size,
+            "cpu",
+            long_lora_context,
+        )
+        self._token_lora_indices = self._pad_to_shape(
+            base_indices, self._token_lora_indices.shape,
+            dims=1).to(self.device)
+        self._sampler_indices = self._pad_to_shape(sampler_indices,
+                                                   self._sampler_indices.shape,
+                                                   dims=1).to(self.device)
+        self._sampler_indices_padded = self._pad_to_shape(
+            sampler_indices_padded, self._sampler_indices_padded.shape,
+            dims=1).to(self.device)
+        self._embeddings_indices = self._pad_to_shape(
+            embeddings_indices, self._embeddings_indices.shape,
+            dims=2).to(self.device)
+        if long_lora_offsets_tensor is not None:
+            self._long_lora_indices = self._pad_to_shape(
+                long_lora_offsets_tensor,
+                self._long_lora_indices.shape,
+                dims=1).to(self.device)
+        else:
+            zeroed = torch.zeros_like(self._long_lora_indices.cpu(),
+                                      dtype=torch.int32)
+            self._long_lora_indices = zeroed.to(self.device)
+        self.indices_len[:] = indices_len
+
     def _update_prefill_metadata(self,
                                  token_lora_tensor: torch.Tensor) -> None:
         self.batch_size = 1
-        self._lora_indices_per_batch[:self.batch_size].copy_(
-            token_lora_tensor[:self.batch_size])
-        # TODO: .item() is extremely inefficient on TPU, so find a way around it
-        self.no_lora = torch.all(token_lora_tensor == -1).item()
+        self._lora_indices_per_batch[:self.
+                                     batch_size] = token_lora_tensor[:self.
+                                                                     batch_size]
+
+    def _pad_prompt_mapping(
+            self, prompt_mapping: tuple[int, ...]) -> tuple[int, ...]:
+        num_reqs = len(prompt_mapping)
+
+        # From vllm/v1/worker/tpu_model_runner:51, but need to avoid a circular
+        # import
+        MIN_NUM_SEQS = 8
+
+        padded_num_reqs = max(2**math.ceil(math.log2(num_reqs)), MIN_NUM_SEQS)
+        pad_len = padded_num_reqs - num_reqs
+
+        padding = [-1] * pad_len
+        return tuple(list(prompt_mapping) + padding)
+
+    def _pad_to_shape(self, src, target_shape, dims=1):
+        if dims == 1:
+            pad_len = target_shape[0] - src.shape[0]
+            return F.pad(src, (0, pad_len), value=0).to(torch.int32)
+        else:
+            pad_rows = target_shape[0] - src.shape[0]
+            pad_cols = target_shape[1] - src.shape[1]
+            return F.pad(src, (0, pad_cols, 0, pad_rows),
+                         value=0).to(torch.int32)
diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py
index 1adb40b4c284be994e2954a61f41fad22dd1231c..0b0a7989f390747ca8823451162fbf16148398c3 100644
--- a/vllm/lora/punica_wrapper/utils.py
+++ b/vllm/lora/punica_wrapper/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING, Optional, Union
 
diff --git a/vllm/lora/request.py b/vllm/lora/request.py
index 616e94f8d678f47e453f5590af0a9899ff738409..5bbba7830c1b1081f5d6bba64a8abe7028a2f1b5 100644
--- a/vllm/lora/request.py
+++ b/vllm/lora/request.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
 from typing import Optional
diff --git a/vllm/lora/resolver.py b/vllm/lora/resolver.py
index 33f35322fe85fb628d75beb2597d44c5144909b2..5808ae105e86459eaf7c327f200d0ec2153998e9 100644
--- a/vllm/lora/resolver.py
+++ b/vllm/lora/resolver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from collections.abc import Set
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 619dd3bdc40afa7c99c753cefb37cea4885e7c41..ee196e3f689a2151c692572d9fde7ee44d6bae04 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Optional, Union
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index afc8a8dc3b26012b662eadb8c8f953307274cf2b..7da44569f40862df5360481bd4346c45450c1b5f 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
 from typing import Any, Literal, Optional, Union
@@ -229,6 +230,11 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
             self.add_adapter(lora)
 
     def add_adapter(self, lora_request: LoRARequest) -> bool:
+        # Note that this method is not thread-safe. It may be invoked multiple
+        # times for the same adapter when using multiple API servers.
+        # This is ok because it's currently only called from
+        # the single-threaded core engine loop.
+
         if lora_request.lora_int_id not in self.list_adapters():
             # Load the new adapter first to ensure it is actually valid, before
             # evicting any existing adapters.
diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py
index 7636152176f13962a68576a432582e261d5c1fca..55dfe8088c8f3e84c9d5f776db8ad525b47718b7 100644
--- a/vllm/model_executor/__init__.py
+++ b/vllm/model_executor/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            PackedvLLMParameter)
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index acf7224675e4f29a5de77a7e7521a2dce81ac457..7e6cdd98751066cc571d2c315d266195ae1d3281 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch.nn as nn
 
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
index a2b61a1b19e4d351d3840dbc24fe90554b85faf1..3c2998bece441b0a05037d466d74fc19d38c39c4 100644
--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/model_executor/guided_decoding/guidance_decoding.py b/vllm/model_executor/guided_decoding/guidance_decoding.py
index 58adcc3caff99ef84842f23fbf1b19e8cfcbb51b..05b6a1c3239f1ee6aae16726850cdb6e223eb79b 100644
--- a/vllm/model_executor/guided_decoding/guidance_decoding.py
+++ b/vllm/model_executor/guided_decoding/guidance_decoding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 
 import llguidance
diff --git a/vllm/model_executor/guided_decoding/guidance_logits_processors.py b/vllm/model_executor/guided_decoding/guidance_logits_processors.py
index e17df68b4b4da5a4b73c648452f3eb2f8c24a0c8..379b5eaa38a76dc32732953484e8d55f55c7c89e 100644
--- a/vllm/model_executor/guided_decoding/guidance_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/guidance_logits_processors.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 import os
 from typing import Any
diff --git a/vllm/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py
index 085f37a5d51670deeb27c1095a859e0fa377dfcf..fa97b6dbf51159c06376febf16c0b1c121eafa1f 100644
--- a/vllm/model_executor/guided_decoding/guided_fields.py
+++ b/vllm/model_executor/guided_decoding/guided_fields.py
@@ -1,14 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Optional, TypedDict, Union
 
-from pydantic import BaseModel
-
 
 # These classes are deprecated, see SamplingParams
 class LLMGuidedOptions(TypedDict, total=False):
-    guided_json: Union[dict, BaseModel, str]
+    guided_json: Union[dict, str]
     guided_regex: str
     guided_choice: list[str]
     guided_grammar: str
@@ -20,7 +19,7 @@ class LLMGuidedOptions(TypedDict, total=False):
 @dataclass
 class GuidedDecodingRequest:
     """One of the fields will be used to retrieve the logit processor."""
-    guided_json: Optional[Union[dict, BaseModel, str]] = None
+    guided_json: Optional[Union[dict, str]] = None
     guided_regex: Optional[str] = None
     guided_choice: Optional[list[str]] = None
     guided_grammar: Optional[str] = None
diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
index 7eaf9e38e66a33ab3fc450b5053366dc5860de63..f9b51f4c157453afd7e6201e02fa6976d5294d0e 100644
--- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
+++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import lru_cache
 from json import loads as json_loads
diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py
index e41af4b360e45b5b62599e62face7a870c61880c..26c2d958e7511a37b08507609b9fc08c57c23f27 100644
--- a/vllm/model_executor/guided_decoding/outlines_decoding.py
+++ b/vllm/model_executor/guided_decoding/outlines_decoding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 import concurrent.futures
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index 6986b6554c23046c9d0509894c57078769dc5f37..4ef4db7c4a399088d2c4f2c10f3fd5f5adeccc9e 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024- the Outlines developers
 # This file is adapted from
diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py
index 3f77cf394d9a36459ddfb83c8f036b14ca3f981d..8fdfa983e120b98f47a7a5b3354613dca12c9907 100644
--- a/vllm/model_executor/guided_decoding/utils.py
+++ b/vllm/model_executor/guided_decoding/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import regex as re
 
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index d2e568609945949887af18e0a37741502aa7f2e0..bdd3a1a9c0a59cd348305945ad8b7b835bb810f6 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # noqa: UP007
 from __future__ import annotations
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index a32c26317a8840b81edc513d3915c7820c7f56cc..cc9c8d445ab6c9021721d086b50ea82273311caf 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom activation functions."""
 import math
 from typing import Optional
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 5c262287f7dd4d92263bd98350f9e98f5a0a753f..2bdc96e297c1f2a06da5e4980bcadb6f0e6a714e 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
 from typing import Any, Optional
diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..76d71ca08856c9f20a90b8c00511357812838580
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: Apache-2.0
+import importlib.util
+from typing import Optional
+
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.utils import (
+    _resize_cache, per_token_group_quant_fp8)
+
+logger = init_logger(__name__)
+
+has_deep_gemm = importlib.util.find_spec("deep_gemm") is not None
+
+
+class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
+
+    # The Deep Gemm kernels only support block size of 128
+    DEEPGEMM_BLOCK_SHAPE = 128
+
+    def __init__(self, max_num_tokens: int, world_size: int, dp_size: int,
+                 block_shape: list[int]):
+        """
+        max_num_tokens: Maximum number of tokens from a DP Rank
+        world_size: Number of EP ranks
+        dp_size: Number of data-parallel ranks
+        block_shape: Block quantization block shape
+        """
+        super().__init__()
+        self.max_num_tokens = max_num_tokens
+        self.world_size = world_size
+        self.dp_size = dp_size
+        self.block_shape = block_shape
+
+        assert (len(self.block_shape) == 2 and all(
+            [v == self.DEEPGEMM_BLOCK_SHAPE for v in self.block_shape]))
+
+    def workspace_shapes(
+        self,
+        a: torch.Tensor,
+        aq: torch.Tensor,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        num_experts: int,
+    ) -> tuple[int, int, torch.dtype]:
+        assert a.dim() == 2
+        num_dp = self.world_size // self.dp_size
+        max_num_tokens = a.size(
+            0) if self.max_num_tokens is None else self.max_num_tokens
+        workspace13 = num_experts * max_num_tokens * num_dp * max(K, N)
+        workspace2 = num_experts * max_num_tokens * num_dp * (N // 2)
+        return (workspace13, workspace2, a.dtype)
+
+    def apply(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        w1_scale: Optional[torch.Tensor],
+        w2_scale: Optional[torch.Tensor],
+        w1_zp: Optional[torch.Tensor],
+        w2_zp: Optional[torch.Tensor],
+        a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_num_tokens: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        import deep_gemm as dg
+        assert hidden_states.ndim == 3
+
+        a1q = hidden_states
+        _, N, K = w1.size()
+
+        if global_num_experts == -1:
+            global_num_experts = w1.size(0)
+
+        assert w2.size(1) == K
+
+        E, max_num_tokens, N, K, top_k_num = mk._moe_problem_size(
+            hidden_states, w1, w2, topk_ids)
+
+        workspace1 = _resize_cache(workspace13, (E, max_num_tokens, N))
+        workspace2 = _resize_cache(workspace2, (E, max_num_tokens, N // 2))
+        workspace3 = _resize_cache(workspace13, (E, max_num_tokens, K))
+
+        # (from deepgemm docs) : A value hint (which is a value on CPU)
+        # for the M expectation of each batch, correctly setting this value
+        # may lead to better performance.
+        expected_m = max_num_tokens
+
+        dg.m_grouped_gemm_fp8_fp8_bf16_nt_masked((a1q, a1q_scale),
+                                                 (w1, w1_scale),
+                                                 out=workspace1,
+                                                 masked_m=expert_num_tokens,
+                                                 expected_m=expected_m)
+
+        # TODO (varun) [Optimization]: Use a batched version of activation.
+        # Similarly for the quant below.
+        self.activation(activation, workspace2, workspace1.view(-1, N))
+
+        w2_hidden_size = workspace2.size(-1)
+        workspace2 = workspace2.view(-1, w2_hidden_size)
+
+        a2q_scale: Optional[torch.Tensor] = None
+        a2q, a2q_scale = per_token_group_quant_fp8(workspace2,
+                                                   self.block_shape[1],
+                                                   column_major_scales=False)
+        a2q = a2q.view(E, max_num_tokens, -1)
+        a2q_scale = a2q_scale.view(E, max_num_tokens, -1)
+
+        dg.m_grouped_gemm_fp8_fp8_bf16_nt_masked((a2q, a2q_scale),
+                                                 (w2, w2_scale),
+                                                 out=workspace3,
+                                                 masked_m=expert_num_tokens,
+                                                 expected_m=expected_m)
+
+        return workspace3
diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d62d519af8d7b94c1fdca6bdfa3c12bc62235860
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional
+
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+    BatchedDeepGemmExperts)
+from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+    BatchedTritonExperts)
+
+
+class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
+
+    def __init__(self,
+                 max_num_tokens: int,
+                 world_size: int,
+                 dp_size: int,
+                 use_fp8_w8a8: bool = False,
+                 use_int8_w8a8: bool = False,
+                 use_int8_w8a16: bool = False,
+                 use_int4_w4a16: bool = False,
+                 per_channel_quant: bool = False,
+                 block_shape: Optional[list[int]] = None,
+                 allow_deep_gemm: bool = False):
+        super().__init__()
+        assert not use_int8_w8a8, "NYI"
+        assert not use_int8_w8a16, "NYI"
+        assert not use_int4_w4a16, "NYI"
+
+        self.max_num_tokens = max_num_tokens
+        self.world_size = world_size
+        self.dp_size = dp_size
+        self.use_fp8_w8a8 = use_fp8_w8a8
+        self.use_int8_w8a8 = use_int8_w8a8
+        self.use_int8_w8a16 = use_int8_w8a16
+        self.use_int4_w4a16 = use_int4_w4a16
+        self.per_channel_quant = per_channel_quant
+        self.block_shape = block_shape
+        self.allow_deep_gemm = allow_deep_gemm
+
+        # BatchedTritonKernel doesn't support block quantization
+        # at the moment.
+        self.batched_triton_experts = BatchedTritonExperts(
+            max_num_tokens=self.max_num_tokens,
+            use_fp8_w8a8=self.use_fp8_w8a8,
+            use_int8_w8a8=self.use_int8_w8a8,
+            use_int8_w8a16=self.use_int8_w8a16,
+            use_int4_w4a16=self.use_int4_w4a16,
+            per_channel_quant=self.per_channel_quant,
+            block_shape=self.block_shape,
+            world_size=self.world_size,
+            dp_size=self.dp_size) if self.block_shape is None else None
+
+        is_fp8_128_block_quantized = (self.use_fp8_w8a8
+                                      and self.block_shape is not None
+                                      and len(self.block_shape) == 2 and all(
+                                          [b == 128
+                                           for b in self.block_shape]))
+        self.batched_deep_gemm_experts = BatchedDeepGemmExperts(
+            max_num_tokens=self.max_num_tokens,
+            world_size=self.world_size,
+            dp_size=self.dp_size,
+            block_shape=self.block_shape,  # type: ignore[arg-type]
+        ) if (self.allow_deep_gemm and is_fp8_128_block_quantized) else None
+
+    def workspace_shapes(
+        self,
+        a: torch.Tensor,
+        aq: torch.Tensor,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        num_experts: int,
+    ) -> tuple[int, int, torch.dtype]:
+        # Note: the deep gemm workspaces are strictly larger than the triton
+        # workspaces so we can be pessimistic here and allocate for DeepGemm
+        # even if we fall back to triton later, e.g. if expert maps are set.
+        if self.allow_deep_gemm and self.batched_deep_gemm_experts is not None:
+            return self.batched_deep_gemm_experts.workspace_shapes(
+                a, aq, M, N, K, topk, num_experts)
+        else:
+            assert self.batched_triton_experts is not None
+            return self.batched_triton_experts.workspace_shapes(
+                a, aq, M, N, K, topk, num_experts)
+
+    def apply(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        w1_scale: Optional[torch.Tensor],
+        w2_scale: Optional[torch.Tensor],
+        w1_zp: Optional[torch.Tensor],
+        w2_zp: Optional[torch.Tensor],
+        a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_num_tokens: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        use_batched_deep_gemm_experts = (self.allow_deep_gemm
+                                         and self.batched_deep_gemm_experts
+                                         is not None)
+        experts = (self.batched_deep_gemm_experts
+                   if use_batched_deep_gemm_experts else
+                   self.batched_triton_experts)
+        assert experts is not None
+        return experts.apply(hidden_states, w1, w2, topk_ids, activation,
+                             global_num_experts, expert_map, w1_scale,
+                             w2_scale, w1_zp, w2_zp, a1q_scale, a2_scale,
+                             workspace13, workspace2, expert_num_tokens)
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json
new file mode 100644
index 0000000000000000000000000000000000000000..b50682075949667ee2ec58e55c139655f84bb542
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json
new file mode 100644
index 0000000000000000000000000000000000000000..b0139b9f2af405e3a91209ae366886a2e9755445
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..9264ca17fdcf09f300976332fb39935e14c18156
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index 26a433da2189ac915b07ffe211dc2d2d71c79c97..6e7b1a4f2b6c901722f06626b36d1aad680cf62d 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """ CUTLASS based Fused MoE kernels."""
-from typing import Optional
+from typing import Callable, Optional
 
 import torch
 
@@ -12,110 +13,109 @@ from vllm.model_executor.layers.fused_moe.utils import _fp8_perm, _resize_cache
 from vllm.scalar_type import scalar_types
 
 
-class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute):
-
-    def __init__(
-        self,
-        ab_strides1: torch.Tensor,
-        c_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides2: torch.Tensor,
-        out_dtype: torch.dtype,
-    ):
-        super().__init__()
-        self.ab_strides1 = ab_strides1
-        self.c_strides1 = c_strides1
-        self.ab_strides2 = ab_strides2
-        self.c_strides2 = c_strides2
-        self.out_dtype = out_dtype
-
-    def workspace_shapes(
-        self,
-        a: torch.Tensor,
-        M: int,
-        N: int,
-        K: int,
-        topk: int,
-        num_experts: int,
-    ) -> tuple[int, int, torch.dtype]:
-        # Note that K, N are transposed
-        N, K = K, N
-        workspace1 = M * topk * max(2 * N, K)
-        workspace2 = M * topk * N
-        return (workspace1, workspace2, self.out_dtype)
-
-    def apply(
-        self,
-        hidden_states: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_ids: torch.Tensor,
-        activation: str,
-        global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        w1_scale: Optional[torch.Tensor],
-        w2_scale: Optional[torch.Tensor],
-        w1_zp: Optional[torch.Tensor],
-        w2_zp: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
-        workspace13: torch.Tensor,
-        workspace2: torch.Tensor,
-        expert_num_tokens: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        a1q = hidden_states
-
-        assert w1_scale is not None
-        assert w2_scale is not None
-        assert w1.dtype == torch.float8_e4m3fn
-        assert w2.dtype == torch.float8_e4m3fn
-        assert a1q.shape[1] == w1.shape[1], "Hidden size mismatch w1"
-        assert w1.shape[2] == w2.shape[1] * 2, "Hidden size mismatch w2"
-        assert w1.shape[0] == w2.shape[0], "Expert number mismatch"
-        assert a1q_scale is None or a1q_scale.dim(
-        ) == 0 or a1q_scale.shape[0] == 1 or a1q_scale.shape[0] == a1q.shape[
-            0], "Input scale shape mismatch"
-        assert w1_scale.dim() == 1 or w1_scale.shape[1] == 1 or w1_scale.shape[
-            1] == w1.shape[2], "W1 scale shape mismatch"
-        assert w2_scale.dim() == 1 or w2_scale.shape[1] == 1 or w2_scale.shape[
-            1] == w2.shape[2], "W2 scale shape mismatch"
-        assert w1.shape[0] == w2.shape[0], "Weights expert number mismatch"
-        assert w1.shape[0] == w1_scale.shape[
-            0], "w1 scales expert number mismatch"
-        assert w1.shape[0] == w2_scale.shape[
-            0], "w2 scales expert number mismatch"
-        assert a2_scale is None or a1q_scale is None or a2_scale.shape == a1q_scale.shape, "Intermediate scale shape mismatch"  # noqa: E501
-        assert self.ab_strides1.shape[0] == w1.shape[
-            0], "AB Strides 1 expert number mismatch"
-        assert self.c_strides1.shape[0] == w1.shape[
-            0], "C Strides 1 expert number mismatch"
-        assert self.ab_strides2.shape[0] == w2.shape[
-            0], "AB Strides 2 expert number  mismatch"
-        assert self.c_strides2.shape[0] == w2.shape[
-            0], "C Strides 2 expert number mismatch"
-        assert self.out_dtype in [torch.half,
-                                  torch.bfloat16], "Invalid output dtype"
-
-        M = a1q.shape[0]
-        _, N, K = w2.shape  # because w1 + w2 are transposed
-        device = a1q.device
-
-        assert w1.shape[1] == K
-        assert global_num_experts != -1
-        assert a1q_scale is not None
-
-        if expert_map is not None:
-            "Translate info from expert_map to topk_ids"
-            local_topk_ids = torch.where(expert_map[topk_ids] != -1,
-                                         expert_map[topk_ids], -1)
-        else:
-            local_topk_ids = topk_ids
+def run_cutlass_moe_fp8(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_ids: torch.Tensor,
+    activation_callable: Callable,
+    global_num_experts: int,
+    expert_map: Optional[torch.Tensor],
+    w1_scale: Optional[torch.Tensor],
+    w2_scale: Optional[torch.Tensor],
+    a1q_scale: Optional[torch.Tensor],
+    a2_scale: Optional[torch.Tensor],
+    workspace13: torch.Tensor,
+    workspace2: torch.Tensor,
+    expert_num_tokens: Optional[torch.Tensor],
+    out_dtype: torch.dtype,
+    per_act_token: bool,
+    per_out_ch: bool,
+) -> torch.Tensor:
+    a1q = hidden_states
+
+    assert w1_scale is not None
+    assert w2_scale is not None
+    assert w1.dtype == torch.float8_e4m3fn
+    assert w2.dtype == torch.float8_e4m3fn
+    if expert_num_tokens is None:
+        assert a1q.shape[1] == w1.shape[2], "Hidden size mismatch w1"
+    else:
+        assert a1q.shape[2] == w1.shape[2], "Hidden size mismatch w1"
+    assert w1.shape[1] == w2.shape[2] * 2, "Hidden size mismatch w2"
+    assert w1_scale.dim() == 1 or w1_scale.shape[1] == 1 or w1_scale.shape[
+        1] == w1.shape[1], "W1 scale shape mismatch"
+    assert w2_scale.dim() == 1 or w2_scale.shape[1] == 1 or w2_scale.shape[
+        1] == w2.shape[1], "W2 scale shape mismatch"
+    assert w1.shape[0] == w2.shape[0], "Expert number mismatch"
+    assert a1q_scale is None or a1q_scale.dim(
+    ) == 0 or a1q_scale.shape[0] == 1 or a1q_scale.shape[0] == a1q.shape[
+        0], "Input scale shape mismatch"
+    assert w1.shape[0] == w2.shape[0], "Weights expert number mismatch"
+    assert w1.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch"
+    assert w1.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch"
+    assert a2_scale is None or a2_scale.dim(
+    ) == 0 or a2_scale.shape[0] == 1 or a2_scale.shape[0] == a1q.shape[
+        0], "Intermediate scale shape mismatch"
+    assert out_dtype in [torch.half, torch.bfloat16], "Invalid output dtype"
+    if expert_map is not None:
+        assert expert_num_tokens is None
+
+    # We have two modes: PPLX and non-PPLX. We differentiate them by checking
+    # if expert_num_tokens is None (expert_num_tokens is a tensor which PPLX
+    # uses to track the number of tokens per expert).
+    # In the non-PPLX mode, the input tokens are not padded: thus, the shape
+    # of the input is [total_num_tokens, hidden_size]. The input and output
+    # require shuffling by a_map and c_map such that the tokens assigned to
+    # each expert are contiguous.
+    # In the PPLX mode, the input tokens are padded per expert to ensure that
+    # the PPLX dispatch and combine functions work correctly: thus, the shape
+    # of the input is [num_experts, max_num_tokens_per_expert, hidden_size].
+    # The PPLX input and output require no shuffling by a_map and c_map since
+    # their tokens are already contiguous for each expert as a result of
+    # the dispatch function.
+    is_pplx = expert_num_tokens is not None
+
+    M = a1q.shape[0]  # no pplx
+    padded_M = a1q.shape[1]  # pplx
+    _, K, N = w2.shape
+    device = a1q.device
+
+    assert w1.shape[2] == K
+    assert global_num_experts != -1
+    assert a1q_scale is not None
+
+    if expert_map is not None:
+        "Translate info from expert_map to topk_ids"
+        local_topk_ids = torch.where(expert_map[topk_ids] != -1,
+                                     expert_map[topk_ids], -1)
+    else:
+        local_topk_ids = topk_ids
+
+    topk = local_topk_ids.shape[1]
+    local_E = w1.shape[0]
+
+    if is_pplx:
+        expert_offsets = torch.empty((local_E),
+                                     dtype=torch.int32,
+                                     device=device)
+        problem_sizes1 = torch.empty((local_E, 3),
+                                     dtype=torch.int32,
+                                     device=device)
+        problem_sizes2 = torch.empty((local_E, 3),
+                                     dtype=torch.int32,
+                                     device=device)
 
-        topk = local_topk_ids.shape[1]
+        ops.get_cutlass_pplx_moe_mm_data(expert_offsets, problem_sizes1,
+                                         problem_sizes2, expert_num_tokens,
+                                         local_E, padded_M, N, K)
 
-        per_act_token = a1q_scale.numel() != 1 if a1q_scale is not None else (
-            a2_scale.numel() != 1 if a2_scale is not None else False)
+        w1_scale = w1_scale.reshape(w1_scale.shape[0], -1)
+        w2_scale = w2_scale.reshape(w2_scale.shape[0], -1)
+        a1q = a1q.reshape(-1, a1q.shape[2])
+        a1q_scale = a1q_scale.reshape(-1, a1q_scale.shape[2]).contiguous()
 
+    else:
         expert_offsets = torch.empty((global_num_experts + 1),
                                      dtype=torch.int32,
                                      device=device)
@@ -148,50 +148,130 @@ class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute):
 
         a1q = _fp8_perm(a1q, a_map)
         a1q_scale = a1q_scale[a_map] if per_act_token else a1q_scale
-
+        expert_offsets = expert_offsets[:-1]
+
+    ab_strides1 = torch.full((w1.shape[0], ),
+                             K,
+                             device=device,
+                             dtype=torch.int64)
+    c_strides1 = torch.full((w1.shape[0], ),
+                            2 * N,
+                            device=device,
+                            dtype=torch.int64)
+    ab_strides2 = torch.full((w1.shape[0], ),
+                             N,
+                             device=device,
+                             dtype=torch.int64)
+    c_strides2 = torch.full((w1.shape[0], ),
+                            K,
+                            device=device,
+                            dtype=torch.int64)
+
+    if is_pplx:
+        c1 = _resize_cache(workspace13, (local_E * padded_M, N * 2))
+        c2 = _resize_cache(workspace2, (local_E * padded_M, N))
+        c3 = _resize_cache(workspace13, (local_E * padded_M, K))
+    else:
         c1 = _resize_cache(workspace13, (M * topk, N * 2))
         c2 = _resize_cache(workspace2, (M * topk, N))
         c3 = _resize_cache(workspace13, (M * topk, K))
 
-        ops.cutlass_moe_mm(c1, a1q, w1, a1q_scale, w1_scale,
-                           expert_offsets[:-1], problem_sizes1,
-                           self.ab_strides1, self.ab_strides1, self.c_strides1)
+    ops.cutlass_moe_mm(c1, a1q, w1, a1q_scale, w1_scale, expert_offsets,
+                       problem_sizes1, ab_strides1, ab_strides1, c_strides1,
+                       per_act_token, per_out_ch)
 
-        self.activation(activation, c2, c1)
+    activation_callable(c2, c1)
 
-        a2q, a2q_scale = ops.scaled_fp8_quant(
-            c2, a2_scale, use_per_token_if_dynamic=per_act_token)
+    a2q, a2q_scale = ops.scaled_fp8_quant(
+        c2, a2_scale, use_per_token_if_dynamic=per_act_token)
 
-        if expert_map is not None:
-            c3.fill_(0)
+    if expert_map is not None:
+        c3.fill_(0)
 
-        ops.cutlass_moe_mm(c3, a2q, w2, a2q_scale, w2_scale,
-                           expert_offsets[:-1], problem_sizes2,
-                           self.ab_strides2, self.ab_strides2, self.c_strides2)
+    ops.cutlass_moe_mm(c3, a2q, w2, a2q_scale, w2_scale, expert_offsets,
+                       problem_sizes2, ab_strides2, ab_strides2, c_strides2,
+                       per_act_token, per_out_ch)
 
-        c3 = c3[c_map]
+    if is_pplx:
+        return c3.reshape(local_E, padded_M, K)
+    else:
+        return c3[c_map].view(M, topk, K)
 
-        return c3
+
+class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute):
+
+    def __init__(
+        self,
+        max_experts_per_worker: int,
+        out_dtype: torch.dtype,
+        per_act_token: bool,
+        per_out_ch: bool,
+    ):
+        super().__init__()
+        self.max_experts_per_worker = max_experts_per_worker
+        self.out_dtype = out_dtype
+        self.per_act_token = per_act_token
+        self.per_out_ch = per_out_ch
+
+    def workspace_shapes(
+        self,
+        a: torch.Tensor,
+        aq: torch.Tensor,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        num_experts: int,
+    ) -> tuple[int, int, torch.dtype]:
+        padded_M = aq.shape[1]
+        workspace1 = self.max_experts_per_worker * padded_M * max(N, K)
+        workspace2 = self.max_experts_per_worker * padded_M * (N // 2)
+        return (workspace1, workspace2, self.out_dtype)
+
+    def apply(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        w1_scale: Optional[torch.Tensor],
+        w2_scale: Optional[torch.Tensor],
+        w1_zp: Optional[torch.Tensor],
+        w2_zp: Optional[torch.Tensor],
+        a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_num_tokens: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        assert w1_zp is None, "w1_zp is not supported in CUTLASS MoE"
+        assert w2_zp is None, "w2_zp is not supported in CUTLASS MoE"
+        activation_callable = lambda i, o: self.activation(activation, i, o)
+        return run_cutlass_moe_fp8(hidden_states, w1, w2, topk_ids,
+                                   activation_callable, global_num_experts,
+                                   expert_map, w1_scale, w2_scale, a1q_scale,
+                                   a2_scale, workspace13, workspace2,
+                                   expert_num_tokens, self.out_dtype,
+                                   self.per_act_token, self.per_out_ch)
 
 
-#TODO make the grouped gemm kernel consistent with scaled gemm kernel
 def cutlass_moe_fp8(
     a: torch.Tensor,
     w1_q: torch.Tensor,
     w2_q: torch.Tensor,
-    w1_scale: torch.Tensor,
-    w2_scale: torch.Tensor,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
-    ab_strides1: torch.Tensor,
-    c_strides1: torch.Tensor,
-    ab_strides2: torch.Tensor,
-    c_strides2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    activation: str = "silu",
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
-    out_dtype: torch.dtype = torch.half,
     expert_map: Optional[torch.Tensor] = None,
     apply_router_weight_on_input: bool = False,
+    global_num_experts: int = -1,
 ) -> torch.Tensor:
     """
     This function computes a a8w8-quantized Mixture of Experts (MoE) layer
@@ -206,25 +286,17 @@ def cutlass_moe_fp8(
         Shape: [num_experts, K, 2N] (the weights are passed transposed)
     - w2_q (torch.Tensor): The second set of fp8-quantized expert weights.
         Shape: [num_experts, N, K] (the weights are passed transposed)
+    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
+    - topk_ids (torch.Tensor): The token->expert mappings.
     - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q.
         Shape: [num_experts] or [num_experts, 2N]
     - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q.
         Shape: [num_experts] or [num_experts, K]
-    - gating_output (torch.Tensor): The output of the gating operation
-        (before softmax).
-    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
-    - ab_strides1 (torch.Tensor): The input and weights strides of the first
-        grouped gemm.
-    - c_strides1 (torch.Tensor): The output strides of the first grouped gemm.
-    - ab_strides2 (torch.Tensor): The input and weights strides of the second
-        grouped gemm.
-    - c_strides2 (torch.Tensor): The output strides of the second grouped gemm.
     - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a.
         Shape: scalar or [M]
     - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to
         quantize the intermediate result between the gemms.
         Shape: scalar or [M]
-    - out_dtype (torch.dtype): The output tensor type.
     - expert_map (Optional[torch.Tensor]): In the case of Expert parallel,
         every Rank is responsible for a subset of experts. expert_map is a
         mapping from global expert-id to local expert-id. When expert_map[i]
@@ -232,24 +304,27 @@ def cutlass_moe_fp8(
         expert-id i.
     - apply_router_weight_on_input (bool): When true, the topk weights are
         applied directly on the inputs. This is only applicable when topk is 1.
+    - global_num_experts (int): The total number of experts.
 
     Returns:
     - torch.Tensor: The fp16 output tensor after applying the MoE layer.
     """
     per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
         a2_scale.numel() != 1 if a2_scale is not None else False)
+    per_out_ch = w1_scale.numel() != w1_q.shape[0]
+
+    out_dtype = a.dtype
 
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(
-            per_channel_quant=per_act_token,
             quant_dtype=torch.float8_e4m3fn,
+            per_channel_quant=per_act_token,
         ),
         CutlassExpertsFp8(
-            ab_strides1,
-            c_strides1,
-            ab_strides2,
-            c_strides2,
-            out_dtype,
+            max_experts_per_worker=global_num_experts,
+            out_dtype=out_dtype,
+            per_act_token=per_act_token,
+            per_out_ch=per_out_ch,
         ),
     )
 
@@ -259,9 +334,12 @@ def cutlass_moe_fp8(
         w2_q,
         topk_weights,
         topk_ids,
-        expert_map=expert_map,
-        w1_scale=w1_scale,
-        w2_scale=w2_scale,
+        False,
+        activation,
+        global_num_experts if global_num_experts != -1 else w1_q.size(0),
+        expert_map,
+        w1_scale,
+        w2_scale,
         a1_scale=a1_scale,
         a2_scale=a2_scale,
         apply_router_weight_on_input=apply_router_weight_on_input,
@@ -332,6 +410,7 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor,
     num_topk = topk_ids.shape[1]
 
     expert_offsets = torch.empty((e + 1), dtype=torch.int32, device=device)
+    blockscale_offsets = torch.empty((e + 1), dtype=torch.int32, device=device)
     # Problem size:  (num_experts, (m,2n,k))
     problem_sizes1 = torch.empty((e, 3), dtype=torch.int32, device=device)
     # Problem size:  (num_experts, (m,n,k))
@@ -343,12 +422,10 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor,
     # problem shapes should have [m, n, k]
     # Note that problem sizes are based on logical number of elements.
     ops.get_cutlass_moe_mm_data(topk_ids, expert_offsets, problem_sizes1,
-                                problem_sizes2, a_map, c_map, e, n, k)
+                                problem_sizes2, a_map, c_map, e, n, k,
+                                blockscale_offsets)
 
-    tokens_per_expert = problem_sizes1[:, 0]
-    rounded_tokens_per_expert = (tokens_per_expert + (128 - 1)) // 128 * 128
-    blockscale_offsets = torch.zeros(e + 1, dtype=torch.int32, device=device)
-    blockscale_offsets[1:] = torch.cumsum(rounded_tokens_per_expert, dim=0)
+    a = ops.shuffle_rows(a, a_map)
 
     rep_a_fp4, rep_a_blockscale = ops.scaled_fp4_experts_quant(
         a,
@@ -356,7 +433,7 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor,
         expert_offsets,
         blockscale_offsets,
         num_topk,
-        expert_map=a_map)
+    )
 
     c1 = ops.cutlass_fp4_moe_mm(rep_a_fp4, w1_fp4, rep_a_blockscale,
                                 w1_blockscale, w1_alphas, problem_sizes1,
@@ -377,6 +454,8 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor,
                                 w2_alphas, problem_sizes2, expert_offsets[:-1],
                                 blockscale_offsets[:-1], out_dtype, device)
     del int_fp4, int_blockscale
-    out = (c2[c_map].view(m, num_topk, k) *
+
+    c2 = ops.shuffle_rows(c2, c_map)
+    out = (c2.view(m, num_topk, k) *
            topk_weights.view(m, num_topk, 1).half()).sum(dim=1)
     return out.to(dtype=out_dtype)
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index 46a814e6ecc3cc3d164cd936ff4d2dc58a3ef467..436c632be9c40a6881de9611fbafd7df71b38121 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
 import importlib.util
 from typing import Optional
@@ -11,8 +12,8 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
     _moe_permute)
 from vllm.model_executor.layers.fused_moe.prepare_finalize import (
     MoEPrepareAndFinalizeNoEP)
-from vllm.model_executor.layers.fused_moe.utils import (_fp8_quantize,
-                                                        _resize_cache)
+from vllm.model_executor.layers.fused_moe.utils import (
+    _resize_cache, per_token_group_quant_fp8)
 from vllm.utils import round_up
 
 logger = init_logger(__name__)
@@ -33,10 +34,8 @@ def _valid_deep_gemm_shape(M: int, N: int, K: int):
     return align <= M and N % align == 0 and K % align == 0
 
 
-def _valid_deep_gemm(hidden_states: torch.Tensor,
-                     w1: torch.Tensor,
-                     w2: torch.Tensor,
-                     expert_map: Optional[torch.Tensor] = None) -> bool:
+def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor,
+                     w2: torch.Tensor) -> bool:
     """
     Check if the given problem size is supported by the DeepGemm grouped
     gemm kernel.  All of M, N, K and the quantization block_shape must be
@@ -46,10 +45,6 @@ def _valid_deep_gemm(hidden_states: torch.Tensor,
         logger.debug("DeepGemm disabled: deep_gemm not available.")
         return False
 
-    if expert_map is not None:
-        logger.debug("DeepGemm disabled: expert map NYI.")
-        return False
-
     M = hidden_states.size(0)
     _, K, N = w2.size()
     if not _valid_deep_gemm_shape(M, N, K):
@@ -78,17 +73,20 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         num_experts: int,
     ) -> tuple[int, int, torch.dtype]:
+
         block_m = self.block_shape[0]
         M_sum = (M * topk) + num_experts * (block_m - 1)
         M_sum = round_up(M_sum, block_m)
         workspace1 = M_sum * max(N * 2, K)
-        workspace2 = M_sum * N
+        workspace2 = M_sum * max(N, K)
+
         return (workspace1, workspace2, a.dtype)
 
     def apply(
@@ -115,7 +113,9 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         a1q = hidden_states
         _, N, K = w1.size()
 
-        assert global_num_experts != -1
+        if global_num_experts == -1:
+            global_num_experts = w1.size(0)
+
         assert w2.size(1) == K
 
         a1q, a1q_scale, _, expert_ids, inv_perm = _moe_permute(
@@ -127,28 +127,41 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
             self.block_shape[0],
         )
 
+        if expert_map is not None:
+            # DeepGemm (Grouped Contiguous) kernel needs a valid B index
+            # for all rows of A. To that effect, simply compute with
+            # the 0th weight matrix.
+            # Note that this relies on the fact that corresponding topk
+            # weights would be 0 during weight multiplication.
+            expert_ids = torch.where(expert_ids == -1, 0, expert_ids)
+
         # Note: M_sum is different than the pre-permuted shape of a1q.
         M_sum = a1q.size(0)
-        workspace1 = _resize_cache(workspace13, (M_sum, N))
-        workspace2 = _resize_cache(workspace2, (M_sum, N // 2))
-        workspace3 = _resize_cache(workspace13, (M_sum, K))
+
+        mm1_out = _resize_cache(workspace13, (M_sum, N))
+        act_out = _resize_cache(workspace2, (M_sum, N // 2))
+        quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn),
+                                  (M_sum, N // 2))
+        mm2_out = _resize_cache(workspace2, (M_sum, K))
+        out = _resize_cache(workspace13, (inv_perm.size(0), K))
 
         dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
-            (a1q, a1q_scale), (w1, w1_scale), workspace1, expert_ids)
+            (a1q, a1q_scale), (w1, w1_scale), mm1_out, expert_ids)
 
-        self.activation(activation, workspace2, workspace1.view(-1, N))
+        self.activation(activation, act_out, mm1_out.view(-1, N))
 
         a2q_scale: Optional[torch.Tensor] = None
-
-        a2q, a2q_scale = _fp8_quantize(workspace2, a2_scale, False,
-                                       self.block_shape)
+        a2q, a2q_scale = per_token_group_quant_fp8(act_out,
+                                                   self.block_shape[1],
+                                                   column_major_scales=True,
+                                                   out_q=quant_out)
 
         dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
-            (a2q, a2q_scale), (w2, w2_scale), workspace3, expert_ids)
+            (a2q, a2q_scale), (w2, w2_scale), mm2_out, expert_ids)
 
-        workspace3 = workspace3[inv_perm, ...]
+        torch.index_select(mm2_out, 0, inv_perm, out=out)
 
-        return workspace3
+        return out
 
 
 def deep_gemm_moe_fp8(
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c21d8aa53a648ddae269fc71492b56e31c50f9e
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
@@ -0,0 +1,240 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional
+
+import deep_ep
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.utils import (
+    moe_kernel_quantize_input)
+
+
+class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
+    """
+    Prepare/Finalize using DeepEP High-Throughput kernels.
+    """
+
+    def __init__(self,
+                 buffer: deep_ep.Buffer,
+                 world_size: int,
+                 rank: int,
+                 dp_size: int,
+                 rank_expert_offset: int,
+                 quant_dtype: Optional[torch.dtype] = None,
+                 block_shape: Optional[list[int]] = None):
+        super().__init__()
+        self.buffer = buffer
+        self.world_size = world_size
+        self.rank = rank
+        self.dp_size = dp_size
+        self.rank_expert_offset = rank_expert_offset
+        self.quant_dtype = quant_dtype
+        self.block_shape = block_shape
+        # The dispatch function returns a handle that the combine function
+        # requires. We store the handle here so it is available to the
+        # combine function.
+        self.handle = None
+
+        # From https://github.com/deepseek-ai/DeepEP/blob/9fe9021f29c9083cd1808ab36b740208524d9f63/deep_ep/buffer.py#L164
+        self.available_rank_configs = [2, 4, 8, 16, 24, 32, 64, 128, 144, 160]
+
+    def max_num_tokens_per_rank(self) -> Optional[int]:
+        return None
+
+    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+        return torch.int64
+
+    def _get_dispatch_config(self) -> Optional[deep_ep.Config]:
+        if self.dp_size not in self.available_rank_configs:
+            return None
+        return deep_ep.Buffer.get_dispatch_config(self.dp_size)
+
+    def _get_combine_config(self) -> Optional[deep_ep.Config]:
+        if self.dp_size not in self.available_rank_configs:
+            return None
+        return deep_ep.Buffer.get_combine_config(self.dp_size)
+
+    def _do_quant(self, tokens: torch.Tensor,
+                  token_scales: Optional[torch.Tensor], per_act_token: bool):
+        tokens, token_scales = moe_kernel_quantize_input(
+            tokens, token_scales, self.quant_dtype, per_act_token,
+            self.block_shape)
+        return tokens, token_scales
+
+    def _do_dispatch(self, tokens: torch.Tensor,
+                     token_scales: Optional[torch.Tensor],
+                     rank_topk_ids: torch.Tensor,
+                     rank_topk_weights: torch.Tensor, num_experts: int):
+
+        has_scales = token_scales is not None
+
+        (num_tokens_per_rank, num_tokens_per_rdma_rank, expert_num_tokens,
+         is_token_in_rank, event) = self.buffer.get_dispatch_layout(
+             topk_idx=rank_topk_ids,
+             num_experts=num_experts,
+             previous_event=None,
+             async_finish=False,
+             allocate_on_comm_stream=False)
+
+        token_data = tokens
+        if has_scales:
+            token_data = (tokens, token_scales)
+
+        (
+            token_data, expert_topk_ids, expert_topk_weights,
+            expert_num_tokens_per_expert_list, self.handle, event
+        ) = self.buffer.dispatch(
+            x=token_data,
+            handle=None,
+            num_tokens_per_rank=num_tokens_per_rank,
+            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
+            is_token_in_rank=is_token_in_rank,
+            num_tokens_per_expert=expert_num_tokens,
+            topk_idx=rank_topk_ids,
+            topk_weights=rank_topk_weights,
+            # expert_alignment rounds the number of tokens per expert
+            # to this value.
+            expert_alignment=1,
+            config=self._get_dispatch_config(),
+            previous_event=None,
+            async_finish=False,
+            allocate_on_comm_stream=False)
+
+        if has_scales:
+            expert_x, expert_x_scale = token_data
+        else:
+            expert_x, expert_x_scale = token_data, None
+
+        # The existing MOE kernels assume that all entries of topk_ids are
+        # valid. To that effect, set the -1s in expert_topk_ids to some expert
+        # outside this rank so the expert_map can remap it to -1 when safe.
+        # With Expert Parallel, the experts are divided amongst the rank
+        # sequentially. For rank 0, set it to num_experts - 1 and for all other
+        # ranks set it to 0 as we know that expert_map will have a -1 in those
+        # regions for those ranks.
+        #
+        # DeepEP's topk_ids output refers to the local experts directly. Offset
+        # the topk_ids to move it back to the global experts space so it aligns
+        # with existing vLLM interfaces.
+        expert_topk_ids = torch.where(
+            expert_topk_ids == -1,
+            num_experts - 1 if self.rank_expert_offset == 0 else 0,
+            expert_topk_ids + self.rank_expert_offset)
+
+        return (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids,
+                expert_topk_weights)
+
+    def prepare(
+        self,
+        a1: torch.Tensor,
+        a1_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        rank_topk_weights: torch.Tensor,
+        rank_topk_ids: torch.Tensor,
+        num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        apply_router_weight_on_input: bool,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
+               Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+        if apply_router_weight_on_input:
+            topk = rank_topk_ids.size(1)
+            # TODO: this only works for topK=1, will need to update for topK>1
+            assert topk == 1, (
+                "apply_router_weight_on_input is only implemented for topk=1")
+            a1 = a1 * rank_topk_weights.to(a1.dtype)
+
+        # Check if there is a block_shape / or if we can infer the quantization
+        # schemes from the scales.
+        per_token_quant = None
+        if all([x is None for x in [self.block_shape, a1_scale, a2_scale]
+                ]) and self.quant_dtype is not None:
+            # Quantization required despite none of the inputs suggesting
+            # quantization. Fallback to per_token_dynamic quant.
+            per_token_quant = True
+        else:
+            per_token_quant = ((self.block_shape is not None) or
+                               (a1_scale is not None and a1_scale.numel() != 1)
+                               or (a2_scale is not None
+                                   and a2_scale.numel() != 1))
+
+        if per_token_quant:
+            a1q, a1q_scale = self._do_quant(a1, a1_scale, per_act_token=True)
+            (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids,
+             expert_topk_weights) = self._do_dispatch(
+                 tokens=a1q,
+                 token_scales=a1q_scale,
+                 rank_topk_ids=rank_topk_ids,
+                 rank_topk_weights=rank_topk_weights,
+                 num_experts=num_experts)
+        else:
+            # DeepEP kernels only support dispatching per-token-quant
+            # quantization. dispatch in bfloat16.
+            (expert_x, _, expert_num_tokens, expert_topk_ids,
+             expert_topk_weights) = self._do_dispatch(
+                 tokens=a1,
+                 token_scales=None,
+                 rank_topk_ids=rank_topk_ids,
+                 rank_topk_weights=rank_topk_weights,
+                 num_experts=num_experts)
+            # quantize now
+            expert_x_scale = None
+            if expert_x.numel() != 0:
+                expert_x, expert_x_scale = self._do_quant(expert_x,
+                                                          a1_scale,
+                                                          per_act_token=False)
+
+        return (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids,
+                expert_topk_weights)
+
+    def _apply_weights_and_reduce(self, num_tokens: int,
+                                  fused_expert_output: torch.Tensor,
+                                  topk_weights: torch.Tensor,
+                                  apply_router_weight_on_input: bool,
+                                  output_dtype: torch.dtype):
+
+        hidden_dim = fused_expert_output.size(-1)
+        if fused_expert_output.ndim == 2:
+            fused_expert_output = fused_expert_output.view(
+                num_tokens, -1, hidden_dim)
+
+        if not apply_router_weight_on_input:
+            # The DeepEP combine kernels don't do the topk weight
+            # multiplication. We multiply the weights locally.
+            m_x_topk = fused_expert_output.size(0)
+            fused_expert_output.mul_(topk_weights.view(m_x_topk, -1, 1))
+
+        out = torch.empty((num_tokens, hidden_dim),
+                          device=fused_expert_output.device,
+                          dtype=output_dtype)
+        ops.moe_sum(fused_expert_output, out)
+
+        return out
+
+    def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor,
+                 topk_weights: torch.Tensor, topk_ids: torch.Tensor,
+                 apply_router_weight_on_input: bool) -> None:
+
+        assert self.handle is not None
+
+        # fused_expert_output can have 0 tokens - This happens when none of the
+        # tokens from the all2all reach this EP rank.
+        if fused_expert_output.numel() != 0:
+            fused_expert_output = self._apply_weights_and_reduce(
+                num_tokens=topk_ids.size(0),
+                fused_expert_output=fused_expert_output,
+                topk_weights=topk_weights,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+                output_dtype=output.dtype)
+
+        combined_x, _, event = self.buffer.combine(
+            x=fused_expert_output,
+            handle=self.handle,
+            topk_weights=None,
+            config=self._get_combine_config(),
+            previous_event=None,
+            async_finish=False,
+            allocate_on_comm_stream=False)
+        # Respect inplace outputs.
+        output.copy_(combined_x, non_blocking=True)
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..3484a7a8a496a73dcbdd64aa1d821b124d9ed0c8
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -0,0 +1,186 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Union
+
+import deep_ep
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.utils import (
+    moe_kernel_quantize_input)
+
+# DeepEP kernels quantize dispatch inputs in 128 element chunks.
+DEEPEP_QUANT_BLOCK_SIZE = 128
+
+
+def dequant_fp8(expert_x_fp8: torch.Tensor,
+                expert_x_scales: torch.Tensor) -> torch.Tensor:
+    """
+    Return dequantized tensor in fp32
+    """
+    # TODO (varun) : Optimize leverage num_tokens_per_expert counts
+    assert expert_x_fp8.is_contiguous()
+    expert_x_scales = expert_x_scales.contiguous()
+    num_experts = expert_x_fp8.size(0)
+
+    expert_x_fp32 = expert_x_fp8.to(torch.float32).view(
+        num_experts, -1, DEEPEP_QUANT_BLOCK_SIZE)
+    expert_x_scales = expert_x_scales.view(num_experts, -1, 1)
+    return (expert_x_fp32 * expert_x_scales).view(expert_x_fp8.shape)
+
+
+class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
+    """
+    Prepare/Finalize using DeepEP low-latency kernels.
+    """
+
+    # DeepEP low-latency kernels are compiled only for certain
+    # specific hidden sizes.
+    SUPPORTED_HIDDEN_SIZES = [2560, 4096, 5120, 7168]
+
+    def __init__(self,
+                 buffer: deep_ep.Buffer,
+                 world_size: int,
+                 dp_size: int,
+                 max_tokens_per_rank: int,
+                 quant_dtype: Optional[torch.dtype] = None,
+                 block_shape: Optional[list[int]] = None,
+                 use_fp8_dispatch: bool = False):
+        super().__init__()
+
+        self.buffer = buffer
+        self.world_size = world_size
+        self.dp_size = dp_size
+        self.quant_dtype = quant_dtype
+        self.block_shape = block_shape
+        self.max_tokens_per_rank = max_tokens_per_rank
+        self.use_fp8_dispatch = use_fp8_dispatch
+        # The dispatch function returns a handle that the combine function
+        # requires. We store the handle here so it is available to the
+        # combine function.
+        self.handle = None
+
+    def max_num_tokens_per_rank(self) -> Optional[int]:
+        return self.max_tokens_per_rank
+
+    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+        return torch.int64
+
+    def _do_quant(
+            self, x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+            a1_scale: Optional[torch.Tensor], a2_scale: Optional[torch.Tensor],
+            a1_dtype: torch.dtype
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+
+        block_k = self.block_shape[1] if self.block_shape is not None else None
+        if self.use_fp8_dispatch:
+            if block_k == DEEPEP_QUANT_BLOCK_SIZE:
+                # DeepEP kernels did the quantization for us.
+                x, x_scales = x
+                return x, x_scales
+
+            # Dequant to get back the tokens in the datatype we dispatched in.
+            x_fp8, x_scales = x
+            x = dequant_fp8(x_fp8, x_scales).to(dtype=a1_dtype)
+
+        assert isinstance(x, torch.Tensor)
+
+        # Check if there is a block_shape / or if we can infer the quantization
+        # schemes from the scales.
+        per_token_quant = None
+        if all([v is None for v in [self.block_shape, a1_scale, a2_scale]
+                ]) and self.quant_dtype is not None:
+            # Quantization required despite none of the inputs suggesting
+            # quantization. Fallback to per_token_dynamic quant.
+            per_token_quant = True
+        else:
+            per_token_quant = ((self.block_shape is not None) or
+                               (a1_scale is not None and a1_scale.numel() != 1)
+                               or (a2_scale is not None
+                                   and a2_scale.numel() != 1))
+
+        num_experts, max_tokens, hidden_dim = x.size()
+
+        # TODO (varun): Optimization - Use a batched version of quant
+        x = x.view((-1, hidden_dim))
+        x, x_scales = moe_kernel_quantize_input(x, a1_scale, self.quant_dtype,
+                                                per_token_quant,
+                                                self.block_shape)
+        x = x.view((num_experts, -1, hidden_dim))
+
+        if per_token_quant:
+            assert x_scales is not None
+            x_scales = x_scales.view(num_experts, max_tokens, -1)
+
+        return x, x_scales
+
+    def prepare(
+        self,
+        a1: torch.Tensor,
+        a1_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        rank_topk_weights: torch.Tensor,
+        rank_topk_ids: torch.Tensor,
+        num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        apply_router_weight_on_input: bool,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
+               Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+        hidden_size = a1.size(1)
+        assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, \
+            (f"Hidden Size {hidden_size} not in supported list of hidden sizes"
+            f"{self.SUPPORTED_HIDDEN_SIZES}")
+
+        if self.use_fp8_dispatch:
+            assert hidden_size % 128 == 0, \
+            "DeepEP kernels quantize the inputs in blocks of shape 128"
+
+        has_per_token_scales = a1_scale.numel(
+        ) != 1 if a1_scale is not None else (
+            a2_scale.numel() != 1 if a2_scale is not None else False)
+        assert not has_per_token_scales, (
+            "low_latency kernels doesn't support dispatching per-token scales")
+
+        if apply_router_weight_on_input:
+            topk = rank_topk_ids.size(1)
+            # TODO: this only works for topK=1, will need to update for topK>1
+            assert topk == 1, (
+                "apply_router_weight_on_input is only implemented for topk=1")
+            a1 = a1 * rank_topk_weights.to(a1.dtype)
+
+        # Dispatch
+        expert_x, expert_num_tokens, self.handle, event, hook = \
+                self.buffer.low_latency_dispatch(a1,
+                                                rank_topk_ids,
+                                                self.max_tokens_per_rank,
+                                                num_experts,
+                                                use_fp8=self.use_fp8_dispatch,
+                                                async_finish=False,
+                                                return_recv_hook=False)
+
+        expert_x, expert_x_scale = self._do_quant(expert_x, a1_scale, a2_scale,
+                                                  a1.dtype)
+
+        return (expert_x, expert_x_scale, expert_num_tokens, None, None)
+
+    def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor,
+                 topk_weights: torch.Tensor, topk_ids: torch.Tensor,
+                 apply_router_weight_on_input: bool) -> None:
+
+        assert self.handle is not None
+
+        combine_topk_weights = topk_weights
+        if apply_router_weight_on_input:
+            # weights have already been applied.
+            combine_topk_weights = torch.ones_like(topk_weights)
+
+        # TODO (varun) : Enable zero copy mode
+        _, event, hook = self.buffer.low_latency_combine(
+            fused_expert_output,
+            topk_ids,
+            combine_topk_weights,
+            self.handle,
+            async_finish=False,
+            zero_copy=False,
+            return_recv_hook=False,
+            out=output)
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index c2db79365931260d1e9bec2b1c337034bba15cfb..68a3485ff1f6a2166a0a573ed4b8268460b6a69f 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Fused batched MoE kernel."""
 from typing import Optional
 
@@ -9,7 +10,8 @@ import triton.language as tl
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     get_config_dtype_str, try_get_optimal_moe_config)
-from vllm.model_executor.layers.fused_moe.utils import _resize_cache
+from vllm.model_executor.layers.fused_moe.utils import (
+    _resize_cache, moe_kernel_quantize_input)
 
 
 @triton.jit
@@ -396,6 +398,12 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         self.rank = rank
         self.max_num_tokens = max_num_tokens
 
+    def max_num_tokens_per_rank(self) -> Optional[int]:
+        return self.max_num_tokens
+
+    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+        return None
+
     def prepare(
         self,
         a1: torch.Tensor,
@@ -406,7 +414,8 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
+               Optional[torch.Tensor], Optional[torch.Tensor]]:
         assert a1.dim() == 2
         assert topk_ids.dim() == 2
         assert topk_ids.size(0) == a1.size(0)
@@ -449,7 +458,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                  first_expert, :rows, :] = a1[:topks.numel()][topks]
             tokens_per_expert[expert_id - first_expert] = rows
 
-        return b_a1, a1_scale, tokens_per_expert
+        return b_a1, a1_scale, tokens_per_expert, None, None
 
     def finalize(
         self,
@@ -512,6 +521,7 @@ class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
@@ -600,6 +610,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         use_int8_w8a8: bool = False,
         use_int8_w8a16: bool = False,
         use_int4_w4a16: bool = False,
+        per_channel_quant: bool = False,
         block_shape: Optional[list[int]] = None,
         world_size: int = 1,
         dp_size: int = 1,
@@ -610,15 +621,19 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         self.use_int4_w4a16 = use_int4_w4a16
         self.use_int8_w8a16 = use_int8_w8a16
         self.block_shape = block_shape
+        self.per_channel_quant = per_channel_quant
         self.max_num_tokens = max_num_tokens
-        assert not use_int8_w8a8, "NYI"
-        assert not use_int4_w4a16, "NYI"
         self.world_size = world_size
         self.dp_size = dp_size
 
+        assert not use_int8_w8a8, "NYI"
+        assert not use_int4_w4a16, "NYI"
+        assert self.block_shape is None, "NYI"
+
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
@@ -669,8 +684,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             torch.float32, torch.float16, torch.bfloat16, torch.float8_e4m3fn
         ]
 
-        # TODO: num_tokens -> max_num_tokens?
-        E, num_tokens, N, K, top_k_num = mk._moe_problem_size(
+        E, max_num_tokens, N, K, top_k_num = mk._moe_problem_size(
             hidden_states, w1, w2, topk_ids)
 
         assert w1.size(0) == E
@@ -686,7 +700,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             w2.size(),
             top_k_num,
             config_dtype,
-            num_tokens,
+            max_num_tokens,
             block_shape=self.block_shape,
         )
 
@@ -705,10 +719,12 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         #print(f"shape: E={E}, M={num_tokens}, N={N}, K={K}, top_k={top_k_num}")
         # We can reuse the memory between these because by the time we need
         # cache3, we're done with cache1
-        intermediate_cache1 = _resize_cache(workspace13, (E, num_tokens, N))
+        intermediate_cache1 = _resize_cache(workspace13,
+                                            (E, max_num_tokens, N))
         intermediate_cache2 = _resize_cache(workspace2,
-                                            (E, num_tokens, N // 2))
-        intermediate_cache3 = _resize_cache(workspace13, (E, num_tokens, K))
+                                            (E, max_num_tokens, N // 2))
+        intermediate_cache3 = _resize_cache(workspace13,
+                                            (E, max_num_tokens, K))
 
         # MM1
         invoke_moe_batched_triton_kernel(A=hidden_states,
@@ -730,15 +746,20 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         self.activation(activation, intermediate_cache2.view(-1, N // 2),
                         intermediate_cache1.view(-1, N))
 
-        #qintermediate_cache2 = intermediate_cache2
-        a2q_scale = a2_scale
-        # TODO (varun) : support w8a8
-        assert not self.use_fp8_w8a8
-        #if self.use_fp8_w8a8:
-        #    qintermediate_cache2, a2q_scale = _fp8_quantize(
-        #        intermediate_cache2, a2_scale, self.block_shape)
+        ic2_hidden_size = intermediate_cache2.size(-1)
+        intermediate_cache2 = intermediate_cache2.view(-1, ic2_hidden_size)
+
+        qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
+            A=intermediate_cache2,
+            A_scale=a2_scale,
+            qtype=torch.float8_e4m3fn if self.use_fp8_w8a8 else None,
+            per_channel_quant=self.per_channel_quant,
+            block_shape=self.block_shape)
 
-        invoke_moe_batched_triton_kernel(A=intermediate_cache2,
+        qintermediate_cache2 = qintermediate_cache2.view(
+            (E, -1, ic2_hidden_size))
+
+        invoke_moe_batched_triton_kernel(A=qintermediate_cache2,
                                          B=w2,
                                          C=intermediate_cache3,
                                          expert_num_tokens=expert_num_tokens,
@@ -751,5 +772,4 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
                                          use_int4_w4a16=self.use_int4_w4a16,
                                          config=config,
                                          block_shape=self.block_shape)
-
         return intermediate_cache3
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 4c84dd5383320fede6162868f31bc62c4e7aa683..40b76994f412c84b756c2113ef72edca73f78e16 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Fused MoE utilities for GPTQ."""
 import functools
 from typing import Optional
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 0e18491c4dee1af0eaf529375b90e61064c8c5e1..12d36a94733437f574c34d96d56949da97f260b9 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Fused MoE kernel."""
 import functools
 import json
@@ -1194,7 +1195,7 @@ def fused_experts(hidden_states: torch.Tensor,
     # permute/unpermute ops are available.
     N = w1.shape[1]
     if (allow_deep_gemm and use_fp8_w8a8 and N > 512
-            and _valid_deep_gemm(hidden_states, w1, w2, expert_map)):
+            and _valid_deep_gemm(hidden_states, w1, w2)):
         assert apply_router_weight_on_input is False
         return deep_gemm_moe_fp8(
             hidden_states=hidden_states,
@@ -1588,6 +1589,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 5ee5bfa92f5ea122efa88148b1d8de6c32c3efab..47cb8032a4a0f6228a60e2e5511aa0df50d75aff 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import importlib
@@ -6,10 +7,13 @@ import importlib
 from abc import abstractmethod
 from dataclasses import dataclass
 from enum import Enum
-from typing import Callable, Optional
+from typing import Callable, Optional, Union
 
 import torch
 import torch.nn.functional as F
+from compressed_tensors.quantization import (QuantizationArgs,
+                                             QuantizationStrategy,
+                                             QuantizationType)
 from torch.nn.parameter import UninitializedParameter
 
 import vllm.envs as envs
@@ -31,23 +35,26 @@ from vllm.platforms.interface import CpuArchEnum
 from vllm.utils import direct_register_custom_op
 
 has_pplx = importlib.util.find_spec("pplx_kernels") is not None
+has_deepep = importlib.util.find_spec("deep_ep") is not None
 
 if current_platform.is_cuda_alike():
-    from .fused_batched_moe import (BatchedPrepareAndFinalize,
-                                    BatchedTritonExperts)
+    from .fused_batched_moe import BatchedTritonExperts
     from .fused_moe import TritonExperts, fused_experts
     from .modular_kernel import (FusedMoEModularKernel,
                                  FusedMoEPermuteExpertsUnpermute,
                                  FusedMoEPrepareAndFinalize)
     if has_pplx:
         from .pplx_prepare_finalize import PplxPrepareAndFinalize
+    if has_deepep:
+        from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
+        from .deepep_ll_prepare_finalize import DeepEPLLPrepareAndFinalize
 else:
     fused_experts = None  # type: ignore
     FusedMoEPermuteExpertsUnpermute = None  # type: ignore
     FusedMoEPrepareAndFinalize = None  # type: ignore
 if is_rocm_aiter_moe_enabled():
     from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
-        rocm_aiter_biased_group_topk as grouped_topk)
+        rocm_aiter_grouped_topk as grouped_topk)
 else:
     from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
 if current_platform.is_tpu():
@@ -72,10 +79,24 @@ class FusedMoEParallelConfig:
 
     use_ep: bool  # whether to use EP or not
 
+    @property
+    def use_all2all_kernels(self):
+        return self.dp_size > 1 and self.use_ep
+
     @property
     def use_pplx_kernels(self):
-        return self.dp_size > 1 and self.use_ep and \
-             envs.VLLM_ALL2ALL_BACKEND == "pplx"
+        return (self.use_all2all_kernels
+                and envs.VLLM_ALL2ALL_BACKEND == "pplx")
+
+    @property
+    def use_deepep_ht_kernels(self):
+        return (self.use_all2all_kernels
+                and envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput")
+
+    @property
+    def use_deepep_ll_kernels(self):
+        return (self.use_all2all_kernels
+                and envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency")
 
     @staticmethod
     def make(tp_size_: int, dp_size_: int,
@@ -194,6 +215,7 @@ class MoEConfig:
     moe_parallel_config: FusedMoEParallelConfig
 
     in_dtype: torch.dtype  # The activation type.
+    quant_dtype: torch.dtype = None
 
     # TODO: add more quantization params, blocked, per-token, etc.
     block_size: int = 128
@@ -232,6 +254,14 @@ class MoEConfig:
     def use_pplx_kernels(self):
         return self.moe_parallel_config.use_pplx_kernels
 
+    @property
+    def use_deepep_ht_kernels(self):
+        return self.moe_parallel_config.use_deepep_ht_kernels
+
+    @property
+    def use_deepep_ll_kernels(self):
+        return self.moe_parallel_config.use_deepep_ll_kernels
+
 
 class FusedMoeWeightScaleSupported(Enum):
     TENSOR = "tensor"
@@ -240,8 +270,22 @@ class FusedMoeWeightScaleSupported(Enum):
     BLOCK = "block"
 
 
+def get_quant_config_input_activations(
+        quant_config: Optional[QuantizationConfig]
+) -> Optional[QuantizationArgs]:
+    if (quant_config is not None and hasattr(quant_config, 'target_scheme_map')
+            and "Linear" in quant_config.target_scheme_map and
+            "input_activations" in quant_config.target_scheme_map["Linear"]):
+        return quant_config.target_scheme_map["Linear"].get(
+            "input_activations")
+    else:
+        return None
+
+
 class FusedMoEMethodBase(QuantizeMethodBase):
 
+    moe: MoEConfig
+
     @abstractmethod
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
@@ -253,7 +297,17 @@ class FusedMoEMethodBase(QuantizeMethodBase):
         all2all_manager = get_ep_group().device_communicator.all2all_manager
         assert all2all_manager is not None
 
-        prepare_finalize = None
+        self.moe = moe
+        quant_dtype = None
+        act_quant_block_size = None
+        from vllm.model_executor.layers.quantization.fp8 import Fp8Config
+        if isinstance(quant_config, Fp8Config):
+            act_quant_block_size = quant_config.weight_block_size
+            quant_dtype = torch.float8_e4m3fn
+
+        prepare_finalize: Optional[Union[PplxPrepareAndFinalize,
+                                         DeepEPHTPrepareAndFinalize,
+                                         DeepEPLLPrepareAndFinalize]] = None
         if moe.use_pplx_kernels:
             all_to_all_args = dict(
                 max_num_tokens=moe.max_num_tokens,
@@ -264,18 +318,26 @@ class FusedMoEMethodBase(QuantizeMethodBase):
                 # dp_size actually means tp_size, bug in pplx kernels
                 dp_size=all2all_manager.tp_group.world_size,
                 hidden_dim=moe.hidden_dim,
-                hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize,
+                hidden_dim_bytes=moe.hidden_dim * moe.quant_dtype.itemsize,
                 # For blocked per token: set to
                 #   ceil_div(hidden_dim, block_size) * sizeof(float32)
                 # For per-token: set to sizeof(float32)
-                hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else (
-                    (moe.hidden_dim + moe.block_size - 1) // moe.block_size *
-                    torch.float32.itemsize)),
-                group_name=all2all_manager.cpu_group.group_name,
+                hidden_dim_scale_bytes=(
+                    0 if moe.quant_dtype.itemsize != 1 else
+                    ((moe.hidden_dim + moe.block_size - 1) // moe.block_size *
+                     torch.float32.itemsize)),
             )
 
+            # Intranode pplx a2a takes a group name while internode does not.
+            if not all2all_manager.internode:
+                all_to_all_args[
+                    "group_name"] = all2all_manager.cpu_group.group_name
+
             handle = all2all_manager.get_handle(all_to_all_args)
 
+            input_activations = get_quant_config_input_activations(
+                quant_config)
+
             prepare_finalize = PplxPrepareAndFinalize(
                 handle,
                 max_num_tokens=moe.max_num_tokens,
@@ -283,19 +345,63 @@ class FusedMoEMethodBase(QuantizeMethodBase):
                 rank=all2all_manager.rank,
                 # dp_size actually means tp_size, bug in pplx kernels
                 dp_size=all2all_manager.tp_group.world_size,
-                quant_dtype=moe.in_dtype,
+                quant_dtype=moe.quant_dtype,
+                per_act_token=(input_activations.strategy
+                               == QuantizationStrategy.TOKEN
+                               if input_activations is not None else False),
+            )
+        elif moe.use_deepep_ht_kernels:
+            assert moe.dp_size == all2all_manager.dp_world_size
+
+            all_to_all_args = dict()
+            handle = all2all_manager.get_handle(all_to_all_args)
+            prepare_finalize = DeepEPHTPrepareAndFinalize(
+                handle,
+                world_size=all2all_manager.world_size,
+                rank=all2all_manager.rank,
+                dp_size=all2all_manager.dp_world_size,
+                rank_expert_offset=all2all_manager.rank *
+                moe.num_local_experts,
+                quant_dtype=quant_dtype,
+                block_shape=act_quant_block_size,
             )
 
+        elif moe.use_deepep_ll_kernels:
+            assert moe.dp_size == all2all_manager.dp_world_size
+
+            all_to_all_args = dict(
+                max_num_tokens_per_dp_rank=moe.max_num_tokens,
+                token_hidden_size=moe.hidden_dim,
+                num_ep_ranks=all2all_manager.world_size,
+                num_global_experts=moe.num_experts,
+                num_local_experts=moe.num_experts //
+                all2all_manager.world_size)
+            handle = all2all_manager.get_handle(all_to_all_args)
+
+            # Note (varun): Whether to use FP8 dispatch or not needs some
+            # profiling. Turning it off for now.
+            prepare_finalize = DeepEPLLPrepareAndFinalize(
+                handle,
+                world_size=all2all_manager.world_size,
+                dp_size=all2all_manager.dp_world_size,
+                max_tokens_per_rank=moe.max_num_tokens,
+                quant_dtype=quant_dtype,
+                block_shape=act_quant_block_size,
+                use_fp8_dispatch=False,
+            )
+
+        self.topk_indices_dtype = None
         if prepare_finalize is not None:
-            experts = self.select_gemm_impl(prepare_finalize)
+            self.topk_indices_dtype = prepare_finalize.topk_indices_dtype()
+            experts = self.select_gemm_impl(prepare_finalize, moe)
             self.fused_experts = FusedMoEModularKernel(
                 prepare_finalize,
                 experts,
             )
 
     def select_gemm_impl(
-        self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]
-    ) -> FusedMoEPermuteExpertsUnpermute:
+            self, prepare_finalize: FusedMoEPrepareAndFinalize,
+            moe: Optional[MoEConfig]) -> FusedMoEPermuteExpertsUnpermute:
         # based on the all2all implementation, select the appropriate
         # gemm implementation
         raise NotImplementedError(
@@ -331,6 +437,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     def __init__(self, moe: MoEConfig):
         super().__init__()
         self.fused_experts = fused_experts  # type: ignore
+        self.topk_indices_dtype = None
         self.moe = moe
 
         self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
@@ -340,8 +447,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         else:
             self.rocm_aiter_fused_experts = None  # type: ignore
 
-    def select_gemm_impl(
-            self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]):
+    def select_gemm_impl(self, prepare_finalize: FusedMoEPrepareAndFinalize,
+                         moe: Optional[MoEConfig]):
 
         assert self.fused_experts == fused_experts
 
@@ -350,11 +457,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
 
         experts: Optional[FusedMoEPermuteExpertsUnpermute] = None
 
-        if isinstance(prepare_finalize,
-                      (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)):
+        use_batched_experts = prepare_finalize.max_num_tokens_per_rank(
+        ) is not None
+        if use_batched_experts:
             logger.debug("BatchedTritonExperts %s", self.moe)
+            assert self.moe.dp_size == all2all_manager.dp_world_size
             experts = BatchedTritonExperts(
-                max_num_tokens=MOE_DP_CHUNK_SIZE,
+                max_num_tokens=self.moe.max_num_tokens,
                 world_size=all2all_manager.world_size,
                 # dp_size actually means tp_size, bug in pplx kernels
                 dp_size=all2all_manager.tp_group.world_size,
@@ -363,6 +472,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 use_int8_w8a16=False,
                 use_int4_w4a16=False,
                 block_shape=None,
+                per_channel_quant=False,
             )
         else:
             logger.debug("TritonExperts %s", self.moe)
@@ -508,6 +618,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         activation: str = "silu",
         use_nn_moe: Optional[bool] = False,
     ) -> torch.Tensor:
+
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -519,7 +630,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=torch.uint32 if self.moe.use_pplx_kernels else None)
+            indices_type=self.topk_indices_dtype)
 
         if self.rocm_aiter_moe_enabled:
             assert expert_map is None
@@ -747,7 +858,6 @@ class FusedMoE(torch.nn.Module):
         activation: str = "silu",
     ):
         super().__init__()
-
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
         self.params_dtype = params_dtype
@@ -807,14 +917,24 @@ class FusedMoE(torch.nn.Module):
             from vllm_hpu_extension.ops import DynamicFusedMOE
             self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts)
 
+        # Only support float8 for now.
+        quant_dtype = params_dtype
+        if quant_config is not None:
+            input_activations = get_quant_config_input_activations(
+                quant_config)
+            if (input_activations is not None
+                    and input_activations.num_bits == 8
+                    and input_activations.type == QuantizationType.FLOAT):
+                quant_dtype = torch.float8_e4m3fn
+
         moe = MoEConfig(
             num_experts=self.global_num_experts,
             experts_per_token=top_k,
             hidden_dim=hidden_size,
             num_local_experts=self.local_num_experts,
             moe_parallel_config=self.moe_parallel_config,
-            # TODO (bnell): this needs to be fixed for quantized types.
             in_dtype=params_dtype,
+            quant_dtype=quant_dtype,
             max_num_tokens=MOE_DP_CHUNK_SIZE,
         )
         self.moe_config = moe
@@ -823,11 +943,8 @@ class FusedMoE(torch.nn.Module):
         # Note: get_quant_method will look at the layer's local_num_experts
         # for heuristic purposes, so it must be initialized first.
         quant_method: Optional[QuantizeMethodBase] = None
-
-        if quant_config is None:
-            quant_method = UnquantizedFusedMoEMethod(moe)
-        else:
-            quant_method = quant_config.get_quant_method(self, prefix)
+        quant_method = (UnquantizedFusedMoEMethod(moe) if quant_config is None
+                        else quant_config.get_quant_method(self, prefix))
 
         assert quant_method is not None
         assert isinstance(quant_method, FusedMoEMethodBase)
@@ -858,6 +975,22 @@ class FusedMoE(torch.nn.Module):
 
         self.quant_method.create_weights(layer=self, **moe_quant_params)
 
+        # Chunked all2all staging tensor
+        self.batched_hidden_states: Optional[torch.Tensor] = None
+        self.batched_router_logits: Optional[torch.Tensor] = None
+        if (self.moe_parallel_config.use_pplx_kernels
+                or self.moe_parallel_config.use_deepep_ll_kernels):
+            act_dtype = vllm_config.model_config.dtype
+            self.batched_hidden_states = torch.zeros(
+                (MOE_DP_CHUNK_SIZE, self.hidden_size),
+                dtype=act_dtype,
+                device=torch.cuda.current_device())
+
+            self.batched_router_logits = torch.zeros(
+                (MOE_DP_CHUNK_SIZE, self.global_num_experts),
+                dtype=act_dtype,
+                device=torch.cuda.current_device())
+
     @property
     def tp_size(self):
         return self.moe_parallel_config.tp_size
@@ -890,6 +1023,14 @@ class FusedMoE(torch.nn.Module):
     def use_pplx_kernels(self):
         return self.moe_parallel_config.use_pplx_kernels
 
+    @property
+    def use_deepep_ht_kernels(self):
+        return self.moe_parallel_config.use_deepep_ht_kernels
+
+    @property
+    def use_deepep_ll_kernels(self):
+        return self.moe_parallel_config.use_deepep_ll_kernels
+
     def _load_per_tensor_weight_scale(self, shard_id: str,
                                       param: torch.nn.Parameter,
                                       loaded_weight: torch.Tensor,
@@ -1233,19 +1374,21 @@ class FusedMoE(torch.nn.Module):
         When just tensor-parallel is used, it is not required to reduce
         the shared_experts results immediately. Instead we reduce at the
         once at the end of the MoE op. (Refer to DeepSeekV2MoE module)
-        With EP and the pplx kernels - this is no longer viable as all
+        With EP and all2all kernels - this is no longer viable as all
         GPU ranks in DP, produce the complete set of hidden_states.
         Therefore it is required that we reduce the shared_experts output
         early.
         """
-        return self.use_pplx_kernels
+        return (self.use_pplx_kernels or self.use_deepep_ht_kernels
+                or self.use_deepep_ll_kernels)
 
     def maybe_all_reduce_tensor_model_parallel(
             self, final_hidden_states: torch.Tensor):
         """
         The pplx combine kernel reduces across GPU ranks by default.
         """
-        if self.use_pplx_kernels:
+        if (self.use_pplx_kernels or self.use_deepep_ht_kernels
+                or self.use_deepep_ll_kernels):
             return final_hidden_states
         else:
             return tensor_model_parallel_all_reduce(final_hidden_states)
@@ -1260,18 +1403,39 @@ class FusedMoE(torch.nn.Module):
 
     def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
                              full_router_logits: torch.Tensor):
+        assert self.batched_hidden_states is not None
+        assert self.batched_router_logits is not None
+        assert self.batched_hidden_states.dtype == full_hidden_states.dtype
+        assert self.batched_router_logits.dtype == full_router_logits.dtype
+        # Check size compatibility.
+        assert (
+            self.batched_hidden_states.size(-1) == full_hidden_states.size(-1))
+        assert (
+            self.batched_router_logits.size(-1) == full_router_logits.size(-1))
 
         full_final_hidden_states = torch.empty_like(full_hidden_states)
 
         def process_chunk(chunk_start, chunk_end, skip_result_store=False):
+            chunk_size = chunk_end - chunk_start
             hidden_states = full_hidden_states[chunk_start:chunk_end, :]
             router_logits = full_router_logits[chunk_start:chunk_end, :]
 
+            assert (self.batched_hidden_states.size(0)  # type: ignore
+                    >= chunk_size)
+            assert (self.batched_router_logits.size(0)  # type: ignore 
+                    >= chunk_size)
+            staged_hidden_states = self.batched_hidden_states[:
+                                                              chunk_size, :]  # type: ignore
+            staged_router_logits = self.batched_router_logits[:
+                                                              chunk_size, :]  # type: ignore
+            staged_hidden_states.copy_(hidden_states, non_blocking=True)
+            staged_router_logits.copy_(router_logits, non_blocking=True)
+
             # Matrix multiply.
             final_hidden_states = self.quant_method.apply(
                 layer=self,
-                x=hidden_states,
-                router_logits=router_logits,
+                x=staged_hidden_states,
+                router_logits=staged_router_logits,
                 top_k=self.top_k,
                 renormalize=self.renormalize,
                 use_grouped_topk=self.use_grouped_topk,
@@ -1287,11 +1451,11 @@ class FusedMoE(torch.nn.Module):
 
             if not skip_result_store:
                 full_final_hidden_states[chunk_start:chunk_end, :].copy_(
-                    final_hidden_states)
+                    final_hidden_states, non_blocking=True)
 
         ctx = get_forward_context()
         max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu
-        moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE
+        moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens
 
         num_tokens = full_hidden_states.size(0)
         for chunk_start_ in range(0, max_tokens_across_dp,
@@ -1312,12 +1476,17 @@ class FusedMoE(torch.nn.Module):
     def forward_impl(self, hidden_states: torch.Tensor,
                      router_logits: torch.Tensor):
         assert self.quant_method is not None
-        if self.moe_parallel_config.use_pplx_kernels:
+        if (self.moe_parallel_config.use_pplx_kernels
+                or self.moe_parallel_config.use_deepep_ll_kernels):
             return self.forward_impl_chunked(hidden_states, router_logits)
 
-        if self.dp_size > 1:
+        do_naive_dispatch_combine: bool = (
+            self.dp_size > 1
+            and not self.moe_parallel_config.use_deepep_ht_kernels)
+        if do_naive_dispatch_combine:
             hidden_states, router_logits = get_ep_group().dispatch(
                 hidden_states, router_logits)
+
         # Matrix multiply.
         final_hidden_states = self.quant_method.apply(
             layer=self,
@@ -1338,12 +1507,12 @@ class FusedMoE(torch.nn.Module):
             use_nn_moe=self.use_nn_moe,
         )
 
-        if self.dp_size > 1:
+        if do_naive_dispatch_combine:
             final_hidden_states = get_ep_group().combine(final_hidden_states)
 
         if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
-            # Default set to False. (May have to add shared expert outputs.)
-            final_hidden_states = tensor_model_parallel_all_reduce(
+            # Default set to False. (May have to add shared expert outputs.
+            final_hidden_states = self.maybe_all_reduce_tensor_model_parallel(
                 final_hidden_states)
 
         return final_hidden_states
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 7d3ddf8f14c4d9e6a3c84331f42f4e1891fe0690..e7aaf62fb340826602fc67396fd095f36a4be8c2 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from typing import Optional
 
@@ -93,7 +94,8 @@ class FusedMoEPrepareAndFinalize(ABC):
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
+               Optional[torch.Tensor], Optional[torch.Tensor]]:
         """
         Perform any quantization (and/or) dispatching needed
         for this kernel.
@@ -112,6 +114,10 @@ class FusedMoEPrepareAndFinalize(ABC):
         Returns a tuple of:
         - quantized + dispatched a.
         - quantized + dispatched a1_scales.
+        - Optional tensor as big as number of local experts that contains the
+          number of tokens assigned to each local expert. 
+        - Optional dispatched expert topk IDs
+        - Optional dispatched expert topk weight 
         """
         raise NotImplementedError
 
@@ -137,6 +143,27 @@ class FusedMoEPrepareAndFinalize(ABC):
         """
         raise NotImplementedError
 
+    @abstractmethod
+    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+        """
+        The PrepareFinalize All2All implementations generally constrain the
+        dtype of the topk_ids they support. This function returns the
+        required topk indices dtype so it can be respected.
+        Return None if there are no such restrictions.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def max_num_tokens_per_rank(self) -> Optional[int]:
+        """
+        Some PrepareFinalize All2All implementations are batched. Meaning,
+        they can processes only as set of tokens at a time. This
+        function returns the batch size i.e the maximum number of tokens
+        the implementation can process at a time. 
+        Return None if there are no such restrictions.
+        """
+        raise NotImplementedError
+
 
 class FusedMoEPermuteExpertsUnpermute(ABC):
     """
@@ -148,6 +175,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
@@ -260,6 +288,61 @@ class FusedMoEModularKernel(torch.nn.Module):
         self.prepare_finalize = prepare_finalize
         self.fused_experts = fused_experts
 
+    def _do_fused_experts(
+            self,
+            a1: torch.Tensor,  # input to forward fn
+            a1q: torch.Tensor,  # output of prepare fn
+            w1: torch.Tensor,
+            w2: torch.Tensor,
+            topk_ids: torch.Tensor,
+            expert_num_tokens: torch.Tensor,
+            activation: str,
+            global_num_experts: int,
+            expert_map: Optional[torch.Tensor],
+            w1_scale: Optional[torch.Tensor],
+            w2_scale: Optional[torch.Tensor],
+            w1_zp: Optional[torch.Tensor],
+            w2_zp: Optional[torch.Tensor],
+            a1q_scale: Optional[torch.Tensor],
+            a2_scale: Optional[torch.Tensor]) -> torch.Tensor:
+
+        _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids)
+
+        # Use a1 here to decipher the correct workspace datatype
+        workspace13_shape, workspace2_shape, workspace_dtype = (
+            self.fused_experts.workspace_shapes(a1, a1q, M, N, K, top_k,
+                                                global_num_experts))
+
+        # We can reuse the memory between cache1 and cache3 because by the time
+        # we need cache3, we're done with cache1
+        workspace13 = torch.zeros(workspace13_shape,
+                                  device=a1.device,
+                                  dtype=workspace_dtype)
+        workspace2 = torch.zeros(workspace2_shape,
+                                 device=a1.device,
+                                 dtype=workspace_dtype)
+
+        fused_out = self.fused_experts.apply(
+            a1q,
+            w1,
+            w2,
+            topk_ids,
+            activation=activation,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            w1_zp=w1_zp,
+            w2_zp=w2_zp,
+            a1q_scale=a1q_scale,
+            a2_scale=a2_scale,
+            workspace13=workspace13,
+            workspace2=workspace2,
+            expert_num_tokens=expert_num_tokens,
+        )
+
+        return fused_out
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -314,49 +397,48 @@ class FusedMoEModularKernel(torch.nn.Module):
         Returns:
         - torch.Tensor: The output tensor after applying the MoE layer.
         """
-        a1 = hidden_states
-        E, M, N, K, top_k = _moe_problem_size(a1, w1, w2, topk_ids)
-
-        if global_num_experts == -1:
-            global_num_experts = E
 
+        a1 = hidden_states
         output = a1 if inplace else torch.zeros_like(a1)
 
-        workspace13_shape, workspace2_shape, workspace_dtype = (
-            self.fused_experts.workspace_shapes(a1, M, N, K, top_k,
-                                                global_num_experts))
-
-        # We can reuse the memory between cache1 and cache3 because by the time
-        # we need cache3, we're done with cache1
-        workspace13 = torch.zeros(workspace13_shape,
-                                  device=a1.device,
-                                  dtype=workspace_dtype)
-        workspace2 = torch.zeros(workspace2_shape,
-                                 device=a1.device,
-                                 dtype=workspace_dtype)
-
-        a1q, a1q_scale, expert_num_tokens = self.prepare_finalize.prepare(
-            a1, a1_scale, a2_scale, topk_weights, topk_ids, global_num_experts,
-            expert_map, apply_router_weight_on_input)
-
-        fused_out = self.fused_experts.apply(
-            a1q,
-            w1,
-            w2,
-            topk_ids,
-            activation=activation,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            w1_zp=w1_zp,
-            w2_zp=w2_zp,
-            a1q_scale=a1q_scale,
-            a2_scale=a2_scale,
-            workspace13=workspace13,
-            workspace2=workspace2,
-            expert_num_tokens=expert_num_tokens,
-        )
+        if global_num_experts == -1:
+            global_num_experts = w1.size(0)
+
+        (a1q, a1q_scale, expert_num_tokens, _expert_topk_ids,
+         _expert_topk_weights) = self.prepare_finalize.prepare(
+             a1, a1_scale, a2_scale, topk_weights, topk_ids,
+             global_num_experts, expert_map, apply_router_weight_on_input)
+        # Maybe prepare gathered topk_ids and topk_weights from other EP ranks.
+        topk_ids = topk_ids if _expert_topk_ids is None else _expert_topk_ids
+        topk_weights = (topk_weights if _expert_topk_weights is None else
+                        _expert_topk_weights)
+
+        fused_out = None
+        if a1q.numel() == 0:
+            # This happens when none of the tokens from the all2all reach this
+            # EP rank. Also, note that this is only relevant for CUDAGraph
+            # incompatible all2all kernels like the DeepEP high-throughput
+            # kernels. CUDAGraph compatible all2all kernels like the pplx
+            # kernels and the DeepEP low-latency kernels are always batched
+            # and can never run into the tensor.numel() == 0 case.
+            fused_out = torch.empty_like(a1q).to(dtype=a1.dtype)
+        else:
+            fused_out = self._do_fused_experts(
+                a1=a1,
+                a1q=a1q,
+                w1=w1,
+                w2=w2,
+                topk_ids=topk_ids,
+                expert_num_tokens=expert_num_tokens,
+                activation=activation,
+                global_num_experts=global_num_experts,
+                expert_map=expert_map,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                w1_zp=w1_zp,
+                w2_zp=w2_zp,
+                a1q_scale=a1q_scale,
+                a2_scale=a2_scale)
 
         self.prepare_finalize.finalize(output, fused_out, topk_weights,
                                        topk_ids, apply_router_weight_on_input)
diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
index d025f1257a9f658a278b9ceeb3d768fe1f610aa7..98e175b12ed456e0d23426d72af500229d4ba07f 100644
--- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py
index babeb97308a9f26a9d1bb020c534c38d37f7bdf6..d35bd0098b3ca9cc53c116430085fbd6870fa0a6 100644
--- a/vllm/model_executor/layers/fused_moe/moe_pallas.py
+++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py
@@ -1,7 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.nn.functional as F
+import torch_xla.experimental.custom_kernel  # noqa: F401
 
 
 def _histogram(input: torch.Tensor, min: int, max: int) -> torch.Tensor:
@@ -66,15 +68,10 @@ def fused_moe(
     token_indices = token_indices[topk_argsort_indices]
     group_sizes = _histogram(topk_indices.to(torch.int32), 0, num_experts - 1)
 
-    # NOTE(woosuk): The GMM Pallas kernel requires a different weight layout
-    # from HF Transformers.
-    w1 = w1.transpose(1, 2)
-    w2 = w2.transpose(1, 2)
-
     x = hidden_states[token_indices]
-    x = torch.ops.xla.gmm(x, w1, group_sizes)
+    x = torch.ops.xla.gmm(x, w1, group_sizes, transpose_rhs=True)
     x = F.silu(x[..., :intermediate_size]) * x[..., intermediate_size:]
-    x = torch.ops.xla.gmm(x, w2, group_sizes)
+    x = torch.ops.xla.gmm(x, w2, group_sizes, transpose_rhs=True)
     x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size)
 
     x = x * topk_weights.unsqueeze(dim=-1)
diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
index cb396f26c96e0c64eabd7d6dc29f045887acb558..20ee0d9f780a74d78af4e8a22df41b6a075da1a7 100644
--- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
+++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
@@ -17,14 +18,14 @@ def _moe_permute(
     expert_map: Optional[torch.Tensor],
     block_m: int,
 ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
-           Optional[torch.Tensor]]:
+           torch.Tensor]:
     """
     Determine the sorted_token_ids, expert_ids for the given problem size.
     Permute the hidden states and scales according to `sorted_token_ids`.
     """
     top_k_num = curr_topk_ids.size(1)
 
-    tokens_in_chunk = curr_hidden_states.sizze(0)
+    tokens_in_chunk = curr_hidden_states.size(0)
 
     sorted_token_ids, expert_ids, num_tokens_post_padded = (
         moe_align_block_size(curr_topk_ids,
@@ -36,11 +37,12 @@ def _moe_permute(
     inv_perm: Optional[torch.Tensor] = None
 
     num_tokens = top_k_num * tokens_in_chunk
-    sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1)
     expert_ids = torch.repeat_interleave(expert_ids, block_m, dim=0)
     inv_perm = torch.argsort(sorted_token_ids)[:num_tokens]
 
     # Permute according to sorted token ids.
+    sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1)
+
     curr_hidden_states = _fp8_perm(curr_hidden_states,
                                    sorted_token_ids // top_k_num)
 
diff --git a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
index da27633f272392137c5c57d4a19798ee77e3a9aa..6160da7329518e7aac33cfc03a4efcce2be03ccd 100644
--- a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
+++ b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.nn.functional as F
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index 783ebebbfec9416a11a374cfd5739b498825cd5a..5bc01dbf2025e3b54b1c9ddc520b52efa694f0bf 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import pplx_kernels as pplx
@@ -20,7 +21,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                  rank: int,
                  dp_size: int,
                  quant_dtype: Optional[torch.dtype] = None,
-                 block_shape: Optional[list[int]] = None):
+                 block_shape: Optional[list[int]] = None,
+                 per_act_token: bool = False):
         super().__init__()
         assert max_num_tokens > 0
         self.a2a = a2a
@@ -30,6 +32,13 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         self.rank = rank
         self.dp_size = dp_size
         self.quant_dtype = quant_dtype
+        self.per_act_token = per_act_token
+
+    def max_num_tokens_per_rank(self) -> Optional[int]:
+        return self.max_num_tokens
+
+    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+        return torch.uint32
 
     def prepare(
         self,
@@ -41,7 +50,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
+               Optional[torch.Tensor], Optional[torch.Tensor]]:
         num_tokens = a1.size(0)  # M
         hidden_dim = a1.size(-1)  # K
 
@@ -58,13 +68,14 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                 "apply_router_weight_on_input is only implemented for topk=1")
             a1 = a1 * rank_topk_weights.to(a1.dtype)
 
-        per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
-            a2_scale.numel() != 1 if a2_scale is not None else False)
+        repeat_cols = 4
+        repeat_rows = 1 if self.per_act_token else a1.shape[0]
+        a1q, a1q_scale = moe_kernel_quantize_input(
+            a1, (None if self.per_act_token else a1_scale), self.quant_dtype,
+            self.per_act_token, self.block_shape)
 
-        a1q, a1q_scale = moe_kernel_quantize_input(a1, a1_scale,
-                                                   self.quant_dtype,
-                                                   per_act_token,
-                                                   self.block_shape)
+        if a1q_scale is not None:
+            a1q_scale = a1q_scale.repeat(repeat_rows, repeat_cols)
 
         # rem_experts need to be 0 for pplx to work properly.
         rem_experts = num_experts % self.world_size
@@ -92,7 +103,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                           else 1) * float32_size
             expert_x_scale = torch.empty(
                 (
-                    num_experts,
+                    num_local_experts,
                     expert_x.size(1),
                     (expert_x.size(2) + block_size - 1) // block_size,
                 ),
@@ -113,8 +124,10 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             indices=rank_topk_ids,
             bound_m=bound_m,
         )
+        if expert_x_scale is not None:
+            expert_x_scale = expert_x_scale[:, :, 0:1]
 
-        return expert_x, expert_x_scale, expert_num_tokens
+        return expert_x, expert_x_scale, expert_num_tokens, None, None
 
     def finalize(
         self,
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
index 98f98b3bd20bca7b388743fb9ccc7f0dff6b9528..9ed95e1de9fedce1a077c38fbb7dd48a1009fd1f 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
@@ -23,6 +24,12 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
         self.block_shape = block_shape
         self.quant_dtype = quant_dtype
 
+    def max_num_tokens_per_rank(self) -> Optional[int]:
+        return None
+
+    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+        return None
+
     def prepare(
         self,
         a1: torch.Tensor,
@@ -33,7 +40,9 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool = False,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
+               Optional[torch.Tensor], Optional[torch.Tensor]]:
+
         if apply_router_weight_on_input:
             topk = topk_ids.size(1)
             # TODO: this only works for topK=1, will need to update for topK>1
@@ -46,7 +55,7 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
                                                    self.per_channel_quant,
                                                    self.block_shape)
 
-        return a1q, a1q_scale, None
+        return a1q, a1q_scale, None, None, None
 
     def finalize(
         self,
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index 10b61fcda176738af04552b588e210edceca1589..d44989cce724aecf25847fa6d65566a23bad5890 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from enum import IntEnum
 from functools import cache
 from typing import Optional
@@ -140,6 +141,36 @@ def rocm_aiter_biased_grouped_topk_fake(
     pass
 
 
+def rocm_aiter_grouped_topk_impl(
+        gating_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_expert_group: int,
+        topk_group: int,
+        need_renorm: bool,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0  # mul to topk_weights
+) -> None:
+
+    from aiter import grouped_topk
+
+    grouped_topk(gating_output, topk_weights, topk_ids, num_expert_group,
+                 topk_group, need_renorm, scoring_func, routed_scaling_factor)
+
+
+def rocm_aiter_grouped_topk_fake(
+        gating_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_expert_group: int,
+        topk_group: int,
+        need_renorm: bool,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0  # mul to topk_weights
+) -> None:
+    pass
+
+
 def rocm_aiter_fused_moe_impl(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
@@ -218,36 +249,54 @@ if current_platform.is_rocm():
         dispatch_key=current_platform.dispatch_key,
     )
 
+    direct_register_custom_op(
+        op_name="rocm_aiter_grouped_topk",
+        op_func=rocm_aiter_grouped_topk_impl,
+        mutates_args=["topk_weights", "topk_ids"],
+        fake_impl=rocm_aiter_grouped_topk_fake,
+        dispatch_key=current_platform.dispatch_key,
+    )
+
 
-def rocm_aiter_biased_group_topk(
+def rocm_aiter_grouped_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-    scoring_func: str = "sigmoid",
+    scoring_func: str = "softmax",
     e_score_correction_bias: Optional[torch.Tensor] = None
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    assert scoring_func == "sigmoid", (
-        "rocm_aiter_biased_group_topk only supports 'sigmoid' scoring_func.")
-    assert e_score_correction_bias is not None, (
-        "'e_score_correction_bias' must not be None.")
     token = hidden_states.shape[0]
     device = hidden_states.device
     topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
     topk_weights = torch.empty((token, topk),
                                dtype=torch.float32,
                                device=device)
-    torch.ops.vllm.rocm_aiter_biased_grouped_topk(
-        gating_output,
-        e_score_correction_bias,
-        topk_weights,
-        topk_ids,
-        num_expert_group,
-        topk_group,
-        renormalize,
-    )
+
+    if e_score_correction_bias is not None:
+        torch.ops.vllm.rocm_aiter_biased_grouped_topk(
+            gating_output,
+            e_score_correction_bias,
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            renormalize,
+        )
+    else:
+        assert (scoring_func == "softmax" or scoring_func == "sigmoid")
+        torch.ops.vllm.rocm_aiter_grouped_topk(
+            gating_output,
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            renormalize,
+            scoring_func,
+        )
+
     return topk_weights, topk_ids
 
 
diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
index 2cfe373140bb93940e1d61f1c9474c98d8479f3a..87de29444c01d6f1d04f1d5a46bf670974b433b6 100644
--- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
@@ -28,13 +29,15 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
                                            per_channel_quant=per_channel_quant,
                                            block_shape=block_shape,
                                            block_m=block_m)
-        self.deep_gemm_expert = DeepGemmExperts()
         self.allow_deep_gemm = allow_deep_gemm
         self.use_fp8_w8a8 = use_fp8_w8a8
+        self.deep_gemm_expert = DeepGemmExperts(
+        ) if self.allow_deep_gemm else None
 
     def workspace_shapes(
         self,
         a: torch.Tensor,
+        aq: torch.Tensor,
         M: int,
         N: int,
         K: int,
@@ -45,10 +48,11 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         # workspaces so we can be pessimistic here and allocate for DeepGemm
         # even if we fall back to triton later, e.g. if expert maps are set.
         if self.allow_deep_gemm and _valid_deep_gemm_shape(M, N, K):
+            assert self.deep_gemm_expert is not None
             return self.deep_gemm_expert.workspace_shapes(
-                a, M, N, K, topk, num_experts)
+                a, aq, M, N, K, topk, num_experts)
         else:
-            return self.triton_expert.workspace_shapes(a, M, N, K, topk,
+            return self.triton_expert.workspace_shapes(a, aq, M, N, K, topk,
                                                        num_experts)
 
     def apply(
@@ -72,7 +76,8 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     ) -> torch.Tensor:
         N = w1.size(1)
         if (self.allow_deep_gemm and self.use_fp8_w8a8 and N > 512
-                and _valid_deep_gemm(hidden_states, w1, w2, expert_map)):
+                and _valid_deep_gemm(hidden_states, w1, w2)):
+            assert self.deep_gemm_expert is not None
             return self.deep_gemm_expert.apply(
                 hidden_states,
                 w1,
diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py
index d9d2520e18b3b4255672e9bd61bb06cb2c3c66f1..692482c2ea692fda7f90060c71f5bb429cb924dd 100644
--- a/vllm/model_executor/layers/fused_moe/utils.py
+++ b/vllm/model_executor/layers/fused_moe/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from math import prod
 from typing import Optional
 
@@ -17,8 +18,8 @@ def _resize_cache(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor:
     Shrink the given tensor and apply the given view to it.  This is
     used to resize the intermediate fused_moe caches.
     """
-    assert prod(
-        v) <= x.numel(), f"{prod(v)} <= {x.numel()}"  # CUDAGRAPH unfriendly?
+    assert prod(v) <= x.numel(
+    ), f"{v} ({prod(v)}) <= {x.shape} ({x.numel()})"  # CUDAGRAPH unfriendly?
     return x.flatten()[:prod(v)].view(*v)
 
 
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index e8abd32ff6ba63e92e1020e8de7c4175521eeb9e..b3c65e34178ad96b2eb3b417ca6c093977189a0c 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom normalization layers."""
 from typing import Optional, Union
 
diff --git a/vllm/model_executor/layers/lightning_attn.py b/vllm/model_executor/layers/lightning_attn.py
index 96659af408ed72eca239b22494d6f03689d6b5b3..978086d1909d1f559b6fc300130f83454e0b72ba 100644
--- a/vllm/model_executor/layers/lightning_attn.py
+++ b/vllm/model_executor/layers/lightning_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 from einops import rearrange
 
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 269ac043d26c41165505979b273685bb7b07a4af..588aa8deb183251b1e5d80968def42b592d900ca 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from abc import abstractmethod
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index 6b69a260826b14fe1c2b81bf891914ffdbffb41b..3d01253447c030b8f0f94cf960f972bb48f2fa31 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A layer that compute logits from hidden_stats."""
 import inspect
 from concurrent.futures import ThreadPoolExecutor
diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py
index 019f634a9ef41ce4c866481dc074b7c1e89cd206..88053faf9e524cc4c0c7c87986a3c48718ce1d5f 100644
--- a/vllm/model_executor/layers/mamba/mamba2_metadata.py
+++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from dataclasses import dataclass
 
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index 156e8752e96cfcc5ad05f8fb81f7e0729fb85fc4..118bd8d55c1d8682bdd4edbe66b5d14a9d92d8a9 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 from torch import nn
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index f94ab75f9a4f01decf744e4f763b6a2b431068c8..6d9ea5387879b517ffc89cf95b638a2410895ad3 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
index 21e27160f090b26401be7d6dbc922c5e2d50d183..a10c5ab697874d290b12d1e17a026bc3446de8c4 100644
--- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao.
 # Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py
diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
index 689c940d11ba407d01ffa89fd5a1ed2b7356692a..ccfb278cdff6c64c142766640721b3164f7cd2bc 100644
--- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
+++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/selective_state_update.py
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_bmm.py b/vllm/model_executor/layers/mamba/ops/ssd_bmm.py
index 0fdb055aab82fb74697e9ec0a0286a7f354151d4..11ca1255ebfb6ce633add657cf64bc0f9827361e 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_bmm.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_bmm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_bmm.py
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
index 1652c51814cdff63dd3967fe56d5f88a7f6b0bf7..365e1c54b555a5f1920be406eee62cab8776c676 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_scan.py
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
index ee633569097b6435b3cf7ad6eb727c030a69fd52..58bfb661d332a3140a61b5c0069872a0939a3271 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_state.py
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_combined.py b/vllm/model_executor/layers/mamba/ops/ssd_combined.py
index 79a1663b85bbc607671a0616deb4f002706d5780..b121275e9eb386bc5c798c20077fe2db9d03075a 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_combined.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_combined.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_combined.py
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
index 6f69ca74389e98572272310a18684b66a7e9f642..a28fc9ffad71b5a4d42cc7bac70ff1c82860bb03 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_state_passing.py
diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index 6abbc90819a82a9c1d5a175dfe44ead412693fa8..258038bed40bd57595d19c4d7bf5ba615a36a28a 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from enum import IntEnum
 from typing import Optional, Union
@@ -6,10 +7,9 @@ from typing import Optional, Union
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import PretrainedConfig
 from typing_extensions import assert_never
 
-from vllm.config import PoolerConfig
+from vllm.config import ModelConfig, PoolerConfig
 from vllm.model_executor.pooling_metadata import (PoolingMetadata,
                                                   PoolingTensors)
 from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
@@ -283,30 +283,37 @@ class Pooler(nn.Module):
         )
 
 
-class CrossEncodingPooler(nn.Module):
-    """A layer that pools specific information from hidden states.
+class ClassifierPooler(nn.Module):
+    """A pooling layer for classification tasks.
 
     This layer does the following:
-    1. Extracts specific tokens or aggregates data based on pooling method.
-    2. Normalizes output if specified.
-    3. Returns structured results as `PoolerOutput`.
-
-    Attributes:
-        pooling_type: The type of pooling to use.
-        normalize: Whether to normalize the pooled data.
+    1. Applies a classification layer to the hidden states.
+    2. Optionally applies a pooler layer.
+    3. Applies an activation function to the output. In the case of
+       classification models it is either sigmoid or softmax. In the
+       case of scoring models, the same behavior is configuration
+       dependent, as in the sentence-transformers library.
     """
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: ModelConfig,
         classifier: nn.Module,
         pooler: Optional[nn.Module] = None,
     ):
         super().__init__()
         self.classifier = classifier
         self.pooler = pooler
-        self.default_activation_function = \
-            get_cross_encoder_activation_function(config)
+
+        if config.task == "score":
+            self.default_activation_function = \
+                get_cross_encoder_activation_function(config.hf_config)
+        elif config.task == "classify":
+            self.default_activation_function = nn.Sigmoid() \
+                if config.hf_config.num_labels == 1 else nn.Softmax()
+        else:
+            raise NotImplementedError(f"task={config.task!r} is not supported"
+                                      " with the classification pooler")
 
     def forward(
         self,
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 407b9c72f41d8e4fb0e571e12bbe18a1f982e53b..1cb23e7a18875bc99f488dfa09e852b7262f312c 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Literal, get_args
 
diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py
index 8bf0ca5c0448a8058025553938cd9be3fcb9fa6c..2ea8c5dc51132d3d64a51969a6256b4f339f9700 100644
--- a/vllm/model_executor/layers/quantization/aqlm.py
+++ b/vllm/model_executor/layers/quantization/aqlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Supports AQLM compression, see https://github.com/Vahe1994/AQLM
 # and https://arxiv.org/pdf/2401.06118.pdf
diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py
index 2d9f5e52bd65aec49ec48943206801e171adafaa..ea17cd56c9855156e86875218eb90242ce14fb11 100644
--- a/vllm/model_executor/layers/quantization/auto_round.py
+++ b/vllm/model_executor/layers/quantization/auto_round.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from fractions import Fraction
 from typing import Any, Optional, Union
@@ -116,8 +117,9 @@ class AutoRoundConfig(QuantizationConfig):
 
         quantized = True
         if self.block_name_to_quantize:
-            quantized = any(name in layer_name
-                            for name in self.block_name_to_quantize)
+            quantized = any(
+                layer_name.startswith(name)
+                for name in self.block_name_to_quantize)
         elif isinstance(layer, ParallelLMHead):
             quantized = False
 
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
index 4660c28c8de4aac8494b02c22260da6606d30411..f8bc3ab5e7d1e1a6e23216e9d0d2b9a786d32383 100644
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
@@ -101,7 +102,13 @@ class AWQLinearMethod(LinearMethodBase):
                        output_partition_sizes: list[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
-        if input_size_per_partition % self.quant_config.group_size != 0:
+        # Normalize group_size
+        if self.quant_config.group_size != -1:
+            group_size = self.quant_config.group_size
+        else:
+            group_size = input_size
+
+        if input_size_per_partition % group_size != 0:
             raise ValueError(
                 "The input size is not aligned with the quantized "
                 "weight shape. This can be caused by too large "
@@ -127,9 +134,11 @@ class AWQLinearMethod(LinearMethodBase):
             packed_factor=self.quant_config.pack_factor,
             weight_loader=weight_loader)
 
+        num_groups = input_size_per_partition // group_size
+
         qzeros = PackedvLLMParameter(
             data=torch.empty(
-                input_size_per_partition // self.quant_config.group_size,
+                num_groups,
                 output_size_per_partition // self.quant_config.pack_factor,
                 dtype=torch.int32,
             ),
@@ -140,7 +149,7 @@ class AWQLinearMethod(LinearMethodBase):
             weight_loader=weight_loader)
 
         scales = GroupQuantScaleParameter(data=torch.empty(
-            input_size_per_partition // self.quant_config.group_size,
+            num_groups,
             output_size_per_partition,
             dtype=params_dtype,
         ),
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 0c8d082bb428d88dc445831e5746792676a1e760..56d803c6baf12041e81bf123a9415c5b28f6c29f 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py
index 5e549157897927f2a46073fb9bd552baf7662f23..ebc526d6db2f9349938570ed91d6f874d07730dd 100644
--- a/vllm/model_executor/layers/quantization/awq_triton.py
+++ b/vllm/model_executor/layers/quantization/awq_triton.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index c9533da9d46ebbb754d5de84e090d5373aa26e44..78c5c75c06515bf0eead928fbcba2434cee01d15 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import inspect
 from abc import ABC, abstractmethod
diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py
index 1cd12bb7631785103e78df2d09129f208b7c0a75..9e5ce39ec8f2e8c419c1aaa4653602d7bf1bf7e8 100644
--- a/vllm/model_executor/layers/quantization/bitblas.py
+++ b/vllm/model_executor/layers/quantization/bitblas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 049ce7a7191da2bfa6f65fceea98ce342dc2faae..53ba84ea8e754cb39031f0bff40a47d2dc43a876 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
@@ -70,9 +71,7 @@ class BitsAndBytesConfig(QuantizationConfig):
 
     @staticmethod
     def get_config_filenames() -> list[str]:
-        return [
-            "adapter_config.json",
-        ]
+        return []
 
     @classmethod
     def from_config(cls, config: dict[str, Any]) -> "BitsAndBytesConfig":
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 27547f315fef3a15f79b94c59fd7821b79492641..28c62fc5e58b6f3fdf1f32508767a1004a2cc6f1 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import suppress
 from typing import Any, Literal, Optional, cast
@@ -23,10 +24,10 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
     CompressedTensorsMoEMethod)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, CompressedTensors24,
-    CompressedTensorsScheme, CompressedTensorsW4A16Fp4,
-    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
-    CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
-    CompressedTensorsWNA16)
+    CompressedTensorsScheme, CompressedTensorsW4A4Fp4,
+    CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
+    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
+    CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     find_matched_target, is_activation_quantization_format,
     should_ignore_layer)
@@ -217,19 +218,39 @@ class CompressedTensorsConfig(QuantizationConfig):
         else:
             return False
 
+    def _is_fp4a4_nvfp4(self, weight_quant: BaseModel, input_quant: BaseModel):
+
+        if weight_quant is None or input_quant is None:
+            return False
+
+        is_tensor_group_quant = (weight_quant.strategy
+                                 == QuantizationStrategy.TENSOR_GROUP.value
+                                 and input_quant.strategy
+                                 == QuantizationStrategy.TENSOR_GROUP.value)
+        is_symmetric = weight_quant.symmetric and input_quant.symmetric
+
+        is_group_size_16 = (weight_quant.group_size == 16
+                            and input_quant.group_size == 16)
+        is_float_type = (weight_quant.type == QuantizationType.FLOAT
+                         and input_quant.type == QuantizationType.FLOAT.value)
+        is_4_bits = weight_quant.num_bits == 4 and input_quant.num_bits == 4
+
+        return (is_tensor_group_quant and is_float_type and is_4_bits
+                and is_group_size_16 and is_symmetric)
+
     def _is_fp4a16_nvfp4(self, weight_quant: BaseModel,
                          input_quant: BaseModel):
 
         is_weight_only = weight_quant is not None and input_quant is None
-        is_group_quant = (
-            weight_quant.strategy == QuantizationStrategy.GROUP.value)
+        is_tensor_group_quant = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value)
         is_symmetric = weight_quant.symmetric
 
         is_group_size_16 = weight_quant.group_size == 16
         is_float_type = weight_quant.type == QuantizationType.FLOAT
         is_4_bits = weight_quant.num_bits == 4
 
-        return (is_weight_only and is_group_quant and is_float_type
+        return (is_weight_only and is_tensor_group_quant and is_float_type
                 and is_4_bits and is_group_size_16 and is_symmetric)
 
     def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
@@ -352,6 +373,9 @@ class CompressedTensorsConfig(QuantizationConfig):
                     actorder=weight_quant.actorder)
 
         if is_activation_quantization_format(self.quant_format):
+            if self._is_fp4a4_nvfp4(weight_quant, input_quant):
+                return CompressedTensorsW4A4Fp4()
+
             if self._is_fp8_w8a8(weight_quant, input_quant):
                 is_fp8_w8a8_supported = self._check_scheme_supported(
                     CompressedTensorsW8A8Fp8.get_min_capability(), error=False)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 9241ceeb4db29c727a611eec2e7bd662786d503c..bc9d399cf135b38152979e06bc82a3b408042da9 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
+import importlib
 from enum import Enum
 from typing import Callable, Optional
 
@@ -10,7 +12,6 @@ from compressed_tensors.quantization import (ActivationOrdering,
                                              QuantizationStrategy)
 
 import vllm.envs as envs
-import vllm.model_executor.layers.fused_moe  # noqa
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
@@ -29,6 +30,15 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
+has_pplx = importlib.util.find_spec("pplx_kernels") is not None
+
+if current_platform.is_cuda_alike():
+    from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+        BatchedPrepareAndFinalize)
+    if has_pplx:
+        from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
+            PplxPrepareAndFinalize)
+
 logger = init_logger(__name__)
 
 
@@ -76,8 +86,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
             else:
                 logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod")
                 return CompressedTensorsWNA16MarlinMoEMethod(quant_config)
-        elif (quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
-              and layer.activation == "silu"):
+        elif quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant):
             return CompressedTensorsW8A8Fp8MoECutlassMethod(quant_config)
         elif quant_config._is_fp8_w8a8(weight_quant, input_quant):
             return CompressedTensorsW8A8Fp8MoEMethod(quant_config)
@@ -420,6 +429,11 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
                 "For FP8 Fused MoE layer, we require either per tensor or "
                 "channelwise, dynamic per token quantization.")
 
+        from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+            cutlass_moe_fp8)
+        self.fused_experts = cutlass_moe_fp8  # type: ignore
+        self.disable_expert_map = False
+
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
@@ -498,25 +512,6 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
             layer.w13_input_scale = None
             layer.w2_input_scale = None
 
-        device = w13_weight.device
-        # TODO strides can be shared across multiple layers
-        self.ab_strides1 = torch.full((num_experts, ),
-                                      hidden_size,
-                                      device=device,
-                                      dtype=torch.int64)
-        self.c_strides1 = torch.full((num_experts, ),
-                                     2 * intermediate_size_per_partition,
-                                     device=device,
-                                     dtype=torch.int64)
-        self.ab_strides2 = torch.full((num_experts, ),
-                                      intermediate_size_per_partition,
-                                      device=device,
-                                      dtype=torch.int64)
-        self.c_strides2 = torch.full((num_experts, ),
-                                     hidden_size,
-                                     device=device,
-                                     dtype=torch.int64)
-
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # Fp8 moe kernels require a single activation scale.
         # We take the max of all the scales in case they differ.
@@ -557,6 +552,27 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
             layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales,
                                                         requires_grad=False)
 
+    def select_gemm_impl(self, prepare_finalize, moe):
+        from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+            CutlassExpertsFp8)
+
+        assert moe is not None
+
+        max_experts_per_worker = (
+            (moe.num_experts + prepare_finalize.world_size - 1) //
+            prepare_finalize.world_size)
+        experts = CutlassExpertsFp8(
+            max_experts_per_worker, moe.in_dtype,
+            self.input_quant.strategy == QuantizationStrategy.TOKEN,
+            self.weight_quant.strategy == QuantizationStrategy.CHANNEL)
+
+        if has_pplx and isinstance(
+                prepare_finalize,
+            (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)):
+            # no expert_map support in this case
+            self.disable_expert_map = True
+        return experts
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -576,9 +592,6 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
         activation: str = "silu",
     ) -> torch.Tensor:
 
-        assert activation == "silu", (
-            f"{activation} not supported for Cutlass MoE.")
-
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -589,27 +602,22 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
             num_expert_group=num_expert_group,
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
-            e_score_correction_bias=e_score_correction_bias)
-
-        from vllm.model_executor.layers.fused_moe import cutlass_moe_fp8
+            e_score_correction_bias=e_score_correction_bias,
+            indices_type=torch.uint32)
 
-        return cutlass_moe_fp8(
+        return self.fused_experts(
             x,
-            layer.w13_weight.transpose(1, 2),
-            layer.w2_weight.transpose(1, 2),
-            layer.w13_weight_scale,
-            layer.w2_weight_scale,
+            layer.w13_weight,
+            layer.w2_weight,
             topk_weights,
             topk_ids,
-            self.ab_strides1,
-            self.c_strides1,
-            self.ab_strides2,
-            self.c_strides2,
+            activation=activation,
+            global_num_experts=global_num_experts,
+            expert_map=None if self.disable_expert_map else expert_map,
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
-            out_dtype=x.dtype,
-            expert_map=expert_map,
-            apply_router_weight_on_input=apply_router_weight_on_input,
         )
 
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index 79bf5c108ac2ee41c5501cb9468db43207a3d2eb..6e4e75df76043c3850e17be9dbdf519e9d211641 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .compressed_tensors_scheme import CompressedTensorsScheme
+from .compressed_tensors_w4a4_nvfp4 import CompressedTensorsW4A4Fp4
 from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS,
                                           CompressedTensorsW4A16Sparse24)
 from .compressed_tensors_w4a16_nvfp4 import CompressedTensorsW4A16Fp4
@@ -17,5 +19,6 @@ __all__ = [
     "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24",
     "CompressedTensorsW8A8Int8", "CompressedTensorsW8A8Fp8",
     "WNA16_SUPPORTED_BITS", "W4A16SPARSE24_SUPPORTED_BITS",
-    "CompressedTensors24", "CompressedTensorsW4A16Fp4"
+    "CompressedTensors24", "CompressedTensorsW4A16Fp4",
+    "CompressedTensorsW4A4Fp4"
 ]
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
index f010bc03418c31b1b3c797c99dbb6bfd9017b9aa..30ed55aee04f802e913eae05d764e39504d9a062 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
index daa25d23a3060a5daa527d8c4b8c669043afcdf5..a5d48f23567446e7f7d807e66fc04d0ab05decab 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Optional
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
index 6ea31e50caa726287e784efb988edceafb51d9a2..3f3e7668fcf74e45956a28ea631d7afdeb75b129 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
index cf60b34ba78a9308784d77059dfac623b2d6cabf..8202ce95149690d78e9a1bf7ade16619d83e6d01 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Callable, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
new file mode 100644
index 0000000000000000000000000000000000000000..9899db3243a4d99243cd1ab79c994e1122e1cee0
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Callable, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from vllm._custom_ops import (cutlass_scaled_fp4_mm,
+                              cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (  # noqa: E501
+    dequantize_to_dtype, ref_nvfp4_quant)
+from vllm.model_executor.parameter import (GroupQuantScaleParameter,
+                                           ModelWeightParameter,
+                                           PerTensorScaleParameter)
+from vllm.platforms import current_platform
+
+logger = init_logger(__name__)
+
+__all__ = ["CompressedTensorsW4A4Fp4"]
+
+
+def cutlass_fp4_supported() -> bool:
+    if not current_platform.is_cuda():
+        return False
+    capability_tuple = current_platform.get_device_capability()
+    capability = -1 if capability_tuple is None else capability_tuple.to_int()
+    return cutlass_scaled_mm_supports_fp4(capability)
+
+
+class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
+
+    def __init__(self):
+        self.group_size = 16
+        self.cutlass_nvfp4_supported = cutlass_fp4_supported()
+        if not self.cutlass_nvfp4_supported:
+            logger.warning("Current platform does not support cutlass NVFP4."
+                           " Running emulations.")
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # dont restrict as emulations
+        return 80
+
+    def run_nvfp4_emulations(self, x: torch.Tensor, layer):
+        x_m, x_k = x.shape
+        output_dtype = x.dtype
+
+        # quantize input to (FP4 and interleaved block scale)
+        x_fp4, x_blockscale = ref_nvfp4_quant(x, layer.input_global_scale,
+                                              self.group_size)
+
+        # dequantize input
+        x_fp4 = x_fp4.reshape(x_m, x_k // self.group_size, self.group_size)
+        x_blockscale = x_blockscale.unsqueeze(-1) / layer.input_global_scale
+        x_dq = (x_fp4 * x_blockscale).reshape(x_m, x_k).to(output_dtype)
+        del x_fp4, x_blockscale
+
+        # dequantize weight
+        w_fp4 = layer.weight.data.view(torch.uint8)
+        w_blockscale = layer.weight_scale_swizzled.data
+        w_global_scale = layer.weight_global_scale
+        w_dq = dequantize_to_dtype(w_fp4, w_blockscale, w_global_scale,
+                                   output_dtype, x.device, self.group_size)
+
+        # matmul
+        out = torch.matmul(x_dq, w_dq.t())
+        del w_dq, x_dq
+        return out
+
+    def create_weights(self, layer: torch.nn.Module,
+                       output_partition_sizes: list[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+
+        # Weight
+        weight = ModelWeightParameter(data=torch.empty(
+            sum(output_partition_sizes),
+            input_size_per_partition // 2,
+            dtype=torch.uint8),
+                                      input_dim=1,
+                                      output_dim=0,
+                                      weight_loader=weight_loader)
+        layer.register_parameter("weight_packed", weight)
+
+        # Global Weight Scale
+        weight_global_scale = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader)
+        layer.register_parameter("weight_global_scale", weight_global_scale)
+
+        # Per Group Weight Scale
+        weight_scale = GroupQuantScaleParameter(data=torch.empty(
+            sum(output_partition_sizes),
+            input_size_per_partition // self.group_size,
+            dtype=torch.float8_e4m3fn,
+        ),
+                                                input_dim=1,
+                                                output_dim=0,
+                                                weight_loader=weight_loader)
+
+        layer.register_parameter("weight_scale", weight_scale)
+
+        input_global_scale = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader)
+        layer.register_parameter("input_global_scale", input_global_scale)
+
+    def swizzle_blockscale(self, scale: torch.tensor):
+        assert (scale.dtype == torch.float8_e4m3fn)
+        # Pad and blockwise interleave weight_scale
+        scale_ndim = scale.ndim
+        if scale.ndim == 2:
+            scale = scale.unsqueeze(0)
+        assert scale.ndim == 3
+        B, M, K = scale.shape
+        round_up_multiple = lambda x, m: (x + m - 1) // m * m
+        M_padded = round_up_multiple(M, 128)
+        K_padded = round_up_multiple(K, 4)
+        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
+        padded_scale[:B, :M, :K] = scale
+        batches, rows, cols = padded_scale.shape
+        assert rows % 128 == 0
+        assert cols % 4 == 0
+        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
+                                            cols // 4, 4)
+        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
+        swizzled_scale = swizzled_scale.contiguous().cuda()
+        return (swizzled_scale.reshape(M, K)
+                if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
+
+    def process_weights_after_loading(self, layer) -> None:
+
+        global_input_scale = layer.input_global_scale.max().to(torch.float32)
+        layer.input_global_scale = Parameter(global_input_scale,
+                                             requires_grad=False)
+
+        layer.weight_global_scale = Parameter(
+            layer.weight_global_scale.max().to(torch.float32),
+            requires_grad=False)
+
+        swizzled_weight_scale = self.swizzle_blockscale(layer.weight_scale)
+        layer.weight_scale_swizzled = Parameter(swizzled_weight_scale,
+                                                requires_grad=False)
+
+        # required by cutlass kernel; need Parameter, not ModelWeightParameter
+        layer.weight = Parameter(layer.weight_packed.data, requires_grad=False)
+
+        if self.cutlass_nvfp4_supported:
+            layer.alpha = Parameter(layer.input_global_scale *
+                                    layer.weight_global_scale,
+                                    requires_grad=False)
+
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+
+        if self.cutlass_nvfp4_supported:
+            output_dtype = x.dtype
+            output_shape = [x.shape[0], layer.weight.shape[0]]
+
+            # quantize BF16 or FP16 to (FP4 and interleaved block scale)
+            x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale)
+
+            out = cutlass_scaled_fp4_mm(x_fp4, layer.weight, x_blockscale,
+                                        layer.weight_scale_swizzled,
+                                        1 / layer.alpha, output_dtype)
+            if bias is not None:
+                out = out + bias
+            return out.view(*output_shape)
+        return self.run_nvfp4_emulations(x, layer)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
index 61e4918ca47f2f16ae8e256986fe7de887b6b78c..01a87a088899679abc4f1f633a955ad3a86caacf 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 99bb73b71e9f4c9eccd2e0384d425f920d581ed4..1e61e058cb84c8e3e9b5d8de55e9ed14223c5ae3 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 7792ce86553c6131b983daf61b43df576cc0e612..6189f0609d85d19ec94151cb6220f757a78bb5b3 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index a33c58acb045cb4e55a6d3bd46f54e0cc58982d6..74787603e00295a8bf3561425473e004a9fd36f4 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
index 2380d35702c617cfb50b7f99578fb56ee226eac1..9bcf1aa2bc1cd01c05e0a72eefd217dc93981774 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index 75e81c4dd49d84d6b168cf2cb8c31a883a5e1c87..099d8613fc1a73db206d3e8fc7056ac2d5d1bf1f 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Mapping
 from types import MappingProxyType
@@ -14,6 +15,7 @@ def is_activation_quantization_format(format: str) -> bool:
         CompressionFormat.naive_quantized.value,
         CompressionFormat.int_quantized.value,
         CompressionFormat.float_quantized.value,
+        CompressionFormat.nvfp4_pack_quantized.value
     ]
     return format in _ACTIVATION_QUANTIZATION_FORMATS
 
diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py
index 0c1eaff93e8b1764962c046203d6c7b67383be27..8030be52594452d4897fc2f7fead7b2b1e6213a0 100644
--- a/vllm/model_executor/layers/quantization/deepspeedfp.py
+++ b/vllm/model_executor/layers/quantization/deepspeedfp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 3601d219df3b5f647346c1a515447636c51e90f4..01b0064f08058ede4ff51b3de68e52ed19ad70d7 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
index 223682ee97650a8ca75faaebcd3a46890109e307..3e465ee2cdd21abf1d4f5c521f836f21a1597558 100644
--- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py
+++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index ac9b74945e0ce8011e5d4e3858a971283a6ea609..c785e0d1674daafe9809841bb62897f233bc1799 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import functools
 import importlib.util
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -451,6 +452,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         if envs.VLLM_USE_DEEP_GEMM:
             if not has_deep_gemm:
                 logger.warning_once("Failed to import DeepGemm kernels.")
+            elif not self.block_quant:
+                logger.warning_once("Model is not block quantized. Not using "
+                                    " DeepGemm kernels")
             elif (current_platform.is_cuda()
                   and current_platform.has_device_capability(90)):
                 logger.info_once("Using DeepGemm kernels for Fp8MoEMethod.")
@@ -459,8 +463,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 logger.warning_once(
                     "DeepGemm not supported on the current platform.")
 
+        self.topk_indices_dtype = None
         self.fused_experts = functools.partial(  # type: ignore
             fused_experts,
+            use_fp8_w8a8=True,
             block_shape=self.quant_config.weight_block_size,
             allow_deep_gemm=self.allow_deep_gemm)
 
@@ -763,19 +769,42 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             del layer.w13_input_scale
             del layer.w2_input_scale
 
-    def select_gemm_impl(self, prepare_finalize):
+    def select_gemm_impl(self, prepare_finalize, moe):
+
+        from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
+            BatchedTritonOrDeepGemmExperts)
         from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
             TritonOrDeepGemmExperts)
 
         assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
             "Marlin and ROCm AITER are not supported with all2all yet.")
 
-        experts = TritonOrDeepGemmExperts(
-            use_fp8_w8a8=True,
-            block_shape=self.quant_config.weight_block_size,
-            allow_deep_gemm=self.allow_deep_gemm,
-        )
+        experts: Optional[Union[BatchedTritonOrDeepGemmExperts,
+                                TritonOrDeepGemmExperts]] = None
+        max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
+        use_batched_experts = max_num_tokens_per_rank is not None
 
+        if use_batched_experts:
+            experts = BatchedTritonOrDeepGemmExperts(
+                max_num_tokens=max_num_tokens_per_rank,
+                world_size=prepare_finalize.world_size,
+                dp_size=prepare_finalize.dp_size,
+                use_fp8_w8a8=True,
+                use_int8_w8a8=False,
+                use_int8_w8a16=False,
+                use_int4_w4a16=False,
+                per_channel_quant=False,
+                block_shape=self.quant_config.weight_block_size,
+                allow_deep_gemm=self.allow_deep_gemm,
+            )
+        else:
+            experts = TritonOrDeepGemmExperts(
+                use_fp8_w8a8=True,
+                block_shape=self.quant_config.weight_block_size,
+                allow_deep_gemm=self.allow_deep_gemm,
+            )
+
+        assert experts is not None
         return experts
 
     def apply(
@@ -796,6 +825,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
     ) -> torch.Tensor:
+
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -807,6 +837,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
             e_score_correction_bias=e_score_correction_bias,
+            indices_type=self.topk_indices_dtype,
         )
 
         if self.rocm_aiter_moe_enabled:
@@ -854,7 +885,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 topk_ids=topk_ids,
                 inplace=True,
                 activation=activation,
-                use_fp8_w8a8=True,
                 global_num_experts=global_num_experts,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 expert_map=expert_map,
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index 1fcb6d7afc9b382ed31dc52920e3fcb78e35dec1..2171f729afad11897933488e73163b348ae26781 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
index 436f1e3ccc1a52ac9a5e0e83824b7c324c36b47d..d3ab1be3bee019ff9f4245c9487ad3733e929c4c 100644
--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 from enum import Enum
diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py
index be9510abdffb3b68740f17f0cd848fc1e12f77e8..78e0f59fa4bee365600948061a82f2a4bb280cd4 100644
--- a/vllm/model_executor/layers/quantization/gptq_bitblas.py
+++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any, Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index cf012e145ee68e857e8c3ffe60c4f03b86735300..f92ebdea986da9ae8488442674fd7f64191f1bac 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional, Union
 
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py
index e90416f377915bc44539bc59433aee7b27d320cb..eba917d8541181919cd1fb9e51550de530ccfe3d 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py
index a8faf97723cd134c0202ae30d1c40c9625290d17..ee8a0e34b32e5536500d82eb5f53f93e894f2edd 100644
--- a/vllm/model_executor/layers/quantization/hqq_marlin.py
+++ b/vllm/model_executor/layers/quantization/hqq_marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index 8108c797637d43d7bed30dbbd8c7c62176d86d6b..31ad96eccaf3eac27b015b438f964409d4cab469 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
index 55ad00b1cf461aae92ba9ff2c59215b42758882a..07ecc096231a427942899117c428c928301170c4 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
index bb1dc40ad71a70dc6467846ddc3c39598bd84365..0bf0d530d2351fb015104e5649021b2a75fd4a95 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
index e07177dd675fe6fa1a996145898e1cc3c1630375..785e559df8f75070727e27c00a7364311f170569 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
index 29e20699184c56e2461379df085be1ca6d60b991..649d07b4d0723bf0f586f030c649cf31fd85d558 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
index 50d293cf415bf154bb0cfb6ba68cf01c0e4d2a06..fef333e862d5a85d095b3e8a6f2683dadca0f820 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
index 855867fa4a00683bbabe45d975b2f6294f5e50aa..c7c45861875aff628aecd56aa0e3adcc9f274a0f 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import partial
 from typing import Optional
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
index 899011f000515d26c9700a49806081a6f45bd531..1597492a5cf651b0f4c7a2e8c69abbbe21a540ed 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
index 2d92af74bbf9a97b4db9d830720e9aa3766cf0ff..9ebf5f30379221fc3fbcb8c6487a3eefcc94e7a8 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
index 5d58c0489a286af1db19665437701fc06c2ed5b9..18f5ce04fd355e43a7b1727c94d38ab66f11dcb0 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Optional
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
index 6c2c464e6f1b344526b7f965bfbe5c54428bdb1d..165548a060128e728982481dbb1c2e0937dfd087 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
index 98a0b30be1f626c03f77c5f3d732f65ffc6b1344..6ddd4a9ec42339d2210c674f2654f5aabf8ee0d4 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
index c09ca83d01cbbb4df5840b73880feb378d6022eb..817565cf2827753d067d242967c84de5649e9101 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
index a97b53b9d7b9573b679f59a3a1e4ccb0938190aa..3de28af40aaa593b8b0a6f5726d0d08419eddfff 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
 from typing import Optional
diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py
index 67723c7c91cc5db4e0e573c7ce355a643cc72301..e5604670fb4c141eaba672217888830ad8adc50d 100644
--- a/vllm/model_executor/layers/quantization/kv_cache.py
+++ b/vllm/model_executor/layers/quantization/kv_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py
index 2437030c8771733ebd42e2751f07fbcd6c7f6c0d..62667db26b669af7dabdc92406d412812e9ceafa 100644
--- a/vllm/model_executor/layers/quantization/marlin.py
+++ b/vllm/model_executor/layers/quantization/marlin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 1c5680f952ab5634f9fe3ac80e09ddb059762d92..3f79b203aa170824463a045ba80c4fdba6863d5a 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional, Union
 
@@ -585,9 +586,11 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
 
         # GEMM 1
-        assert torch.allclose(
-            layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]), (
-                "w1_weight_scale_2 must match w3_weight_scale_2")
+        if not torch.allclose(layer.w13_weight_scale_2[:, 0],
+                              layer.w13_weight_scale_2[:, 1]):
+            logger.warning_once(
+                "w1_weight_scale_2 must match w3_weight_scale_2. "
+                "Accuracy may be affected.")
 
         w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0]
         layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2,
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index 74bd6dc13f84aade97fcd75365ed6c4375c42684..3aa23f068257637f06999d061d606b9030608f2c 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py
index 38b374feea81dab78fe9d2cd9a5618c296715e16..8040236663dd1eadf924de928c643e4b405ef222 100644
--- a/vllm/model_executor/layers/quantization/neuron_quant.py
+++ b/vllm/model_executor/layers/quantization/neuron_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from importlib.util import find_spec
@@ -13,6 +14,12 @@ from vllm.model_executor.layers.quantization.base_config import (
 SUPPORTED_QUANT_DTYPE_LIST = ['s8', 'f8e4m3fn']
 
 
+class AlwaysSupportedDtypes(list):
+
+    def __contains__(self, item):
+        return True
+
+
 class NeuronQuantConfig(QuantizationConfig):
     """Int8 Quantization Config class for Neuron Backend."""
 
@@ -35,7 +42,8 @@ class NeuronQuantConfig(QuantizationConfig):
         return "neuron_quant"
 
     def get_supported_act_dtypes(self) -> list[str]:
-        return SUPPORTED_QUANT_DTYPE_LIST
+        # Neuron implements custom handling logic for quantization support
+        return AlwaysSupportedDtypes()
 
     @classmethod
     def get_min_capability(cls) -> int:
diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py
index 9e4fb33639b21f039d3056c80a2c76a7796e6c3b..32ba1055f9c834d2b97ec9aebb33ca936fb0d933 100644
--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py
index 6028b8a2ada3b49bf738754f672a9444479c4fbb..25978cb13b3ab6b6aedc2b5f979b4763d35e75dd 100644
--- a/vllm/model_executor/layers/quantization/qqq.py
+++ b/vllm/model_executor/layers/quantization/qqq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index df4bfbbbcb4c07a1663988923a726adb4ccb4183..6ae5f5c9ad46bc2fdcb4ad74e14d0d0969ef11a3 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import fnmatch
 from typing import Any, Optional, cast
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index aa7d725433eafaf21ca360132d0996b3e23cf336..4c2da4c8b04eedf68806f6b7f9dfa244b2bb1804 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
index d7dac17574ffeb6139151814249a8f275e1f978e..ec09d9b2ac26f7c2e96ca9dc06b6f0085911ed67 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .quark_scheme import QuarkScheme
 from .quark_w4a4_mxfp4 import QuarkW4A4MXFP4
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
index 40c8ea86d3c385417f7810c774b5ebe85baf3a02..c167e949ac262df1eab312116a2d5bd63e11c3dc 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Optional
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
index 34c077b29163acfeedfa1b4893a33aabb5b5c6f8..3c56251b7a0093abf10982f64f213d5aadb1e715 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
index 149c9093797f29aab331d289cf32ed8ba5828075..47e0a492b23b99eef3e4e7c5a8e4187788364138 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
index 94f9fcd56acac64371408c2330fc75083f023160..ae68d5bbc26800a485a83bb3e7523713470dc9d3 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional
 
diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py
index 5e56bcb7564cdb693bd08b423e4b633e079cdcf6..99f5ec15933abc95382ebd8243e85059f64beb1b 100644
--- a/vllm/model_executor/layers/quantization/quark/utils.py
+++ b/vllm/model_executor/layers/quantization/quark/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Mapping
 from types import MappingProxyType
diff --git a/vllm/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py
index c0be40c16affca308783534c99a8fe928674a742..a108152929d9a7fd630db657b82a4c545c1264fa 100644
--- a/vllm/model_executor/layers/quantization/schema.py
+++ b/vllm/model_executor/layers/quantization/schema.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file contains the Pydantic schemas for various quantization-related
 parameters. When a relevant quantization technique is specified, these
diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py
index 7f9f3e643bfa2e0fd107de5e825a330475ea82fb..a7d9332032a286f87a564c1319c423887551e308 100644
--- a/vllm/model_executor/layers/quantization/torchao.py
+++ b/vllm/model_executor/layers/quantization/torchao.py
@@ -1,10 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any, Optional
 
 import torch
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter
 
+from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -12,12 +14,28 @@ from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.utils import set_weight_attrs
 
+logger = init_logger(__name__)
+
 
 class TorchAOConfig(QuantizationConfig):
     """Config class for torchao."""
 
     def __init__(self, torchao_config) -> None:
         self.torchao_config = torchao_config
+        """
+        # TorchAO quantization relies on tensor subclasses. In order,
+        # to enable proper caching this needs standalone compile
+        if is_torch_equal_or_newer("2.8.0"):
+            os.environ["VLLM_TEST_STANDALONE_COMPILE"] = "1"
+            logger.info(
+                "Using TorchAO: Setting VLLM_TEST_STANDALONE_COMPILE=1")
+
+        # TODO: remove after the torch dependency is updated to 2.8
+        if is_torch_equal_or_newer(
+                "2.7.0") and not is_torch_equal_or_newer("2.8.0"):
+            os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1"
+            logger.info("Using TorchAO: Setting VLLM_DISABLE_COMPILE_CACHE=1")
+        """
 
     def __repr__(self) -> str:
         return f"TorchAOConfig({self.torchao_config})"
@@ -60,10 +78,10 @@ class TorchAOConfig(QuantizationConfig):
         if not isinstance(layer, LinearBase):
             return None
 
-        from torchao.quantization import AOPerModuleConfig
+        from torchao.quantization import ModuleFqnToConfig
 
         module_fqn = prefix
-        if isinstance(self.torchao_config, AOPerModuleConfig):
+        if isinstance(self.torchao_config, ModuleFqnToConfig):
             module_fqn_to_config = self.torchao_config.module_fqn_to_config
             c = module_fqn_to_config.get(
                 module_fqn) or module_fqn_to_config.get("_default", None)
diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py
index 7941ec9732fed8b91a9d7733b74408f6a503ad59..83c8a98eac91321a25b128206ebf656ee39be772 100644
--- a/vllm/model_executor/layers/quantization/tpu_int8.py
+++ b/vllm/model_executor/layers/quantization/tpu_int8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
diff --git a/vllm/model_executor/layers/quantization/utils/__init__.py b/vllm/model_executor/layers/quantization/utils/__init__.py
index f7ee4728851408d2fe796b3910e04d270ba0faeb..6ad56bae3dca0cfd3acd380d16d20583153e1330 100644
--- a/vllm/model_executor/layers/quantization/utils/__init__.py
+++ b/vllm/model_executor/layers/quantization/utils/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .layer_utils import replace_parameter, update_tensor_inplace
 
diff --git a/vllm/model_executor/layers/quantization/utils/allspark_utils.py b/vllm/model_executor/layers/quantization/utils/allspark_utils.py
index 97860765a9e1431840b7856bbc676f1b86f9f815..1992b4d201478ca1062f8d6ac4867c8968985dd8 100644
--- a/vllm/model_executor/layers/quantization/utils/allspark_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/allspark_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
index 70d24cc897e107228e6945a5dbbbeafcb7bbfdb3..82ee3edfd5e19a6d5fd9b5fd31aea61cb22f03c4 100644
--- a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 4c213f2c874ea2ccc445dd19c1cbcdec658cf93c..08dc99e07597bb8f7cda6ad94ef96e66b0aacd59 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/sgl-project/sglang/pull/2575
 import functools
@@ -135,24 +136,10 @@ def apply_w8a8_block_fp8_linear(
         use_cutlass, use_aiter_and_is_supported)
 
     if use_cutlass:
-        rows, cols = input_2d.shape
-        # Blackwell GPUs (SM100) require row dimensions to be multiple of 4 for
-        # optimal tensor core usage. Can be removed when targeting platforms
-        # without this constraint.
-        should_pad = current_platform.has_device_capability(
-            100) and rows % 4 != 0
-        if should_pad:
-            input_2d = torch.nn.functional.pad(input_2d,
-                                               (0, 0, 0, 4 - (rows % 4)),
-                                               value=0).contiguous()
-
         q_input, x_scale = per_token_group_quant_fp8(
             input_2d, block_size[1], column_major_scales=use_cutlass)
-
         output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale,
                                       block_size, input.dtype)
-        if should_pad:
-            output = output[:rows, :]
 
     else:
         q_input, x_scale = per_token_group_quant_fp8(
@@ -247,8 +234,13 @@ def _per_token_group_quant_fp8(
     row = g_id // groups_per_row
     row_g_id = g_id % groups_per_row
 
-    y_ptr += (row * y_row_stride) + (row_g_id * group_size)
-    y_q_ptr += g_id * group_size
+    # Ensure offset calculations use int64 to prevent overflow
+    y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) *
+                                                        group_size)
+    y_ptr += y_ptr_offset
+
+    y_q_ptr_offset = g_id.to(tl.int64) * group_size
+    y_q_ptr += y_q_ptr_offset
     y_s_ptr += g_id
 
     cols = tl.arange(0, BLOCK)  # N <= BLOCK
@@ -295,15 +287,23 @@ def _per_token_group_quant_fp8_colmajor(
     row = g_id // groups_per_row
     row_g_id = g_id % groups_per_row
 
-    y_ptr += (row * y_row_stride) + (row_g_id * group_size)
-    y_q_ptr += g_id * group_size
+    # Ensure offset calculations use int64 to prevent overflow
+    y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) *
+                                                        group_size)
+    y_ptr += y_ptr_offset
+
+    y_q_ptr_offset = g_id.to(tl.int64) * group_size
+    y_q_ptr += y_q_ptr_offset
 
     # Convert g_id the flattened block coordinate to 2D so we can index
     # into the output y_scales matrix
     blocks_per_row = y_num_columns // group_size
     scale_col = g_id % blocks_per_row
     scale_row = g_id // blocks_per_row
-    y_s_ptr += scale_col * y_s_col_stride + scale_row
+    # Ensure offset calculation uses int64 for y_s_ptr
+    y_s_ptr_offset = (scale_col.to(tl.int64) * y_s_col_stride) + scale_row.to(
+        tl.int64)
+    y_s_ptr += y_s_ptr_offset
 
     cols = tl.arange(0, BLOCK)  # group_size <= BLOCK
     mask = cols < group_size
@@ -324,6 +324,7 @@ def per_token_group_quant_fp8(
     eps: float = 1e-10,
     dtype: Optional[torch.dtype] = None,
     column_major_scales: bool = False,
+    out_q: Optional[torch.Tensor] = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Function to perform per-token-group quantization on an input tensor `x`.
     It converts the tensor values into signed float8 values and returns the
@@ -334,6 +335,8 @@ def per_token_group_quant_fp8(
         eps: The minimum to avoid dividing zero.
         dtype: The dype of output tensor. Note that only `torch.float8_e4m3fn`
         is supported for now.
+        column_major_scales: Outputs scales in column major.
+        out_q: Optional output tensor. If not provided, function will create.
     Returns:
         tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
         scaling factor for quantization.
@@ -348,7 +351,11 @@ def per_token_group_quant_fp8(
     fp8_min = finfo.min
     fp8_max = finfo.max
 
-    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    assert out_q is None or out_q.shape == x.shape
+    x_q = out_q
+    if x_q is None:
+        x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+
     M = x.numel() // group_size
     N = group_size
     if column_major_scales:
diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py
index 36161d13b24f88b241f2779bc28d3aa65512e703..db82b0def1653c60d436c530be4b37fab30b5d92 100644
--- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from copy import deepcopy
 from typing import Optional, Union
 
diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py
index 72fff3fa1aed1d6c423fa155cfeba94f0589e95d..a694a191745d889ffb2fd4289da4fa3e4c9de347 100644
--- a/vllm/model_executor/layers/quantization/utils/int8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/sgl-project/sglang/blob/4cb53ecd0cffceb6dee5c011a58f65997a86f151/python/sglang/srt/layers/quantization/int8_kernel.py
 import functools
diff --git a/vllm/model_executor/layers/quantization/utils/layer_utils.py b/vllm/model_executor/layers/quantization/utils/layer_utils.py
index 5acae7ca3b84f2047608a7688a3271a37d81331f..fbc0f23acb59ae6769ffb8925954d1fb38754f09 100644
--- a/vllm/model_executor/layers/quantization/utils/layer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/layer_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Union
 
diff --git a/vllm/model_executor/layers/quantization/utils/machete_utils.py b/vllm/model_executor/layers/quantization/utils/machete_utils.py
index 6d840b5686123e57a003d2362b5c61b0fcca9df7..580c36a0e2fa8a6279b9aeed766bb71efea9a920 100644
--- a/vllm/model_executor/layers/quantization/utils/machete_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/machete_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index e059a7ac3f9266691f78280e505decb7ed18e2ea..7540a1516fcb082c826c97762d584be4c06d0ad7 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
index 15177af58ae6e3ff8d442e1feeab983698070e2d..ca10db69dc168a0e3260811510d5901a68304d80 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
@@ -22,7 +23,12 @@ def is_fp4_marlin_supported():
 
 
 def fp4_marlin_process_scales(marlin_scales):
-    assert (marlin_scales >= 0).all()
+    if not (marlin_scales >= 0).all():
+        logger.warning_once(
+            "NVFP4 Marlin assumes the scales to be >=0, but has encountered "
+            "negative scales. Accuracy will likely be degraded. This is "
+            "because it changes the scales from FP8-S1E4M3 to a special "
+            "FP8-S0E5M3 format to speedup the dequantization.")
 
     # convert to half first, we would convert to fp8 later
     marlin_scales = marlin_scales.to(torch.half)
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index 1f6e74244c5d489bd3039307bdd8bf6b8ffbce7e..5372c49d9838bca818fb2dae1148d482e286b5e9 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
index 81112b27f53a8761a76db14a7cc7539464b16e69..b2c228c242532af12a6ad5f5a068c963ce491615 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utility functions used for tests and benchmarks"""
 
 from typing import Optional
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
index 73feb4264a8bb529b6a3905fe50b591abeef0498..1c93c364679dae9d92da80ec5e7d5c19b2250657 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utility functions used for tests and benchmarks"""
 
 import random
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py
index 0123540fc5ddd43b41dca160714b7abb4c915935..8a64bebae04c9b52430d6c4f76dfe79458989c9a 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import numpy
 import torch
diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
index e7c95e38e9fd1c16014d9ec2130dc7936af83b9e..9d4a188f52dfcc63f7400a7f585bbcc8e902fa2d 100644
--- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
index f292208311e25c6b6879175c36aad997bcaeeb28..c4ef3ce24c03add37ea2c2089043ab43bc71a7ab 100644
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
@@ -1,10 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 
-__all__ = [
-    "break_fp4_bytes",
-    "dequantize_to_dtype",
-]
+from vllm.scalar_type import scalar_types
+
+__all__ = ["break_fp4_bytes", "dequantize_to_dtype", "ref_nvfp4_quant"]
+
+FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
 
 kE2M1ToFloat = torch.tensor([0., 0.5, 1., 1.5, 2., 3., 4., 6.],
                             dtype=torch.float32)
@@ -59,3 +61,44 @@ def dequantize_to_dtype(tensor_fp4,
     # scale the tensor
     out = (tensor_f32 * tensor_sf_dtype.unsqueeze(-1)).reshape(m, k)
     return out.to(dtype)
+
+
+def get_reciprocal(x):
+    if isinstance(x, torch.Tensor):
+        return torch.where(x == 0, torch.tensor(0.0, dtype=x.dtype), 1.0 / x)
+    elif isinstance(x, (float, int)):
+        return 0.0 if x == 0 else 1.0 / x
+    else:
+        raise TypeError("Input must be a float, int, or a torch.Tensor.")
+
+
+def cast_to_fp4(x):
+    sign = torch.sign(x)
+    x = torch.abs(x)
+    x[(x >= 0.0) & (x <= 0.25)] = 0.0
+    x[(x > 0.25) & (x < 0.75)] = 0.5
+    x[(x >= 0.75) & (x <= 1.25)] = 1.0
+    x[(x > 1.25) & (x < 1.75)] = 1.5
+    x[(x >= 1.75) & (x <= 2.5)] = 2.0
+    x[(x > 2.5) & (x < 3.5)] = 3.0
+    x[(x >= 3.5) & (x <= 5.0)] = 4.0
+    x[x > 5.0] = 6.0
+    return x * sign
+
+
+def ref_nvfp4_quant(x, global_scale, block_size):
+    assert global_scale.dtype == torch.float32
+    assert x.ndim == 2
+    m, n = x.shape
+    x = torch.reshape(x, (m, n // block_size, block_size))
+    vec_max = torch.max(torch.abs(x), dim=-1,
+                        keepdim=True)[0].to(torch.float32)
+    scale = global_scale * (vec_max * get_reciprocal(FLOAT4_E2M1_MAX))
+    scale = torch.clamp(scale, max=448, min=-448)
+    scale = scale.to(torch.float8_e4m3fn).to(torch.float32)
+    output_scale = get_reciprocal(scale * get_reciprocal(global_scale))
+
+    scaled_x = x.to(torch.float32) * output_scale
+    clipped_x = torch.clamp(scaled_x, -6.0, 6.0).reshape(m, n)
+    # both outputs are float32
+    return cast_to_fp4(clipped_x), scale.squeeze(-1)
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index 6ba327f3db7a45b0cf9f526b2f844f3d56871d59..d6b96774b4e8bb13c83cc7df09b83c286ada270e 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """This file is used for /tests and /benchmarks"""
 from collections.abc import Mapping
 from types import MappingProxyType
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 4b041cff2eccba85aa5fc3f2f1511275cc74848b..adc67aa64952dba5e9e2b0cfe9fd41dd27e691b7 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Callable, Optional, Union
 
@@ -155,8 +156,8 @@ def rocm_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor,
                                    scale_b: torch.Tensor, bias: torch.Tensor,
                                    input_2d: torch.Tensor,
                                    output_shape: list) -> torch.Tensor:
-    from vllm.platforms.rocm import on_mi250_mi300
-    if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi250_mi300(
+    from vllm.platforms.rocm import on_mi3xx
+    if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi3xx(
     ) and qinput.shape[0] == 1 and qinput.shape[1] % 16 == 0:
         output = ops.wvSplitKQ(weight.t(), qinput, out_dtype, scale_a, scale_b,
                                current_platform.get_cu_count())
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index 3db73495827c688798df8a6e5a02ca7a0ef8717f..a6e58a77d42cd715b762d317da2339507f4158a2 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import cached_property
 from importlib.util import find_spec
diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py
index 839688e313aaec7171ba990dbb16f8d2877b5276..3f2d571777c0098c05a7be6fad428626d7365eda 100644
--- a/vllm/model_executor/layers/resampler.py
+++ b/vllm/model_executor/layers/resampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 70463ecd90ae7e2c100df7c99534a033739e697e..9de2338968a1c3ee21ca64713fb721f96b898c34 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
@@ -96,7 +97,7 @@ class RotaryEmbedding(CustomOp):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
     ) -> None:
@@ -113,7 +114,7 @@ class RotaryEmbedding(CustomOp):
         self.cos_sin_cache: torch.Tensor
         self.register_buffer("cos_sin_cache", cache, persistent=False)
 
-    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+    def _compute_inv_freq(self, base: float) -> torch.Tensor:
         """Compute the inverse frequency."""
         # NOTE(woosuk): To exactly match the HF implementation, we need to
         # use CPU to compute the cache and then move it to GPU. However, we
@@ -404,7 +405,7 @@ class LinearScalingRotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         scaling_factors: Union[list[float], float],
         dtype: torch.dtype,
@@ -464,7 +465,7 @@ class NTKScalingRotaryEmbedding(RotaryEmbedding):
                  head_size: int,
                  rotary_dim: int,
                  max_position_embeddings: int,
-                 base: int,
+                 base: float,
                  is_neox_style: bool,
                  scaling_factor: float,
                  dtype: torch.dtype,
@@ -474,7 +475,7 @@ class NTKScalingRotaryEmbedding(RotaryEmbedding):
         super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                          is_neox_style, dtype)
 
-    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+    def _compute_inv_freq(self, base: float) -> torch.Tensor:
         base = self.base * (self.scaling_factor if self.mixed_b is None else 1)
         inv_freq = super()._compute_inv_freq(base)
 
@@ -501,7 +502,7 @@ class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         scaling_factor: float,
         dtype: torch.dtype,
@@ -582,7 +583,7 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         scaling_factor: float,
         dtype: torch.dtype,
@@ -644,7 +645,7 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         rotary_dim: int,
         max_position_embeddings: int,
         original_max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
         short_factor: list[float],
@@ -769,7 +770,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         scaling_factor: float,
         dtype: torch.dtype,
@@ -877,7 +878,7 @@ class Llama3RotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
         scaling_factor: float,
@@ -892,7 +893,7 @@ class Llama3RotaryEmbedding(RotaryEmbedding):
         super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                          is_neox_style, dtype)
 
-    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+    def _compute_inv_freq(self, base: float) -> torch.Tensor:
         inv_freqs = super()._compute_inv_freq(base)
         low_freq_wavelen = self.orig_max_position / self.low_freq_factor
         high_freq_wavelen = self.orig_max_position / self.high_freq_factor
@@ -923,14 +924,14 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
     ):
         super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                          is_neox_style, dtype)
 
-    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+    def _compute_inv_freq(self, base: float) -> torch.Tensor:
         inv_freqs = super()._compute_inv_freq(base)
         inv_freqs = inv_freqs[:(self.rotary_dim // 2)]
         return inv_freqs
@@ -989,7 +990,7 @@ class MRotaryEmbedding(RotaryEmbedding):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
         mrope_section: Optional[list[int]] = None,
@@ -1529,7 +1530,7 @@ class DualChunkRotaryEmbedding(CustomOp):
         head_size: int,
         rotary_dim: int,
         max_position_embeddings: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
         chunk_size: int,
@@ -1558,7 +1559,7 @@ class DualChunkRotaryEmbedding(CustomOp):
                              q_inter_cache,
                              persistent=False)
 
-    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+    def _compute_inv_freq(self, base: float) -> torch.Tensor:
         """Compute the inverse frequency."""
         # NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`.
         # However, we use `torch.arange(..., dtype=torch.float)` instead to
@@ -1705,7 +1706,7 @@ def get_rope(
     head_size: int,
     rotary_dim: int,
     max_position: int,
-    base: int,
+    base: float,
     is_neox_style: bool = True,
     rope_scaling: Optional[dict[str, Any]] = None,
     dtype: Optional[torch.dtype] = None,
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index 32375db0c8f1a36ab216ecc36ce4f7de91c3d9e6..08840fc40cf6afa4dc906a25b59fba65acc3e9d6 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A layer that samples the next tokens from the model's outputs."""
 import itertools
 from collections.abc import Iterator
diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py
index 969cd59b57ccc541dfd49aa171b3f9d409c7bfac..0a36fe9be45b16148c64466528a31f0d5ba1f658 100644
--- a/vllm/model_executor/layers/spec_decode_base_sampler.py
+++ b/vllm/model_executor/layers/spec_decode_base_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import abstractmethod
 from typing import Optional, Union
diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py
index a14c86148e730c909db284d301b484089d44534a..5dabaa5379e7bd6732374cf4b705f4f44dfc4a44 100644
--- a/vllm/model_executor/layers/typical_acceptance_sampler.py
+++ b/vllm/model_executor/layers/typical_acceptance_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.jit
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
index 18783d0d77856908a9e01cdb087fd9e750848c87..41b5253dca04893d3fe7d90cbb73eaa2ea9f6cc8 100644
--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utility methods for model layers."""
 from typing import Callable, Optional
 
@@ -49,16 +50,11 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
                                                    vocab_size, num_seqs)
     output_bin_counts, output_mask = get_token_bin_counts_and_mask(
         output_tokens_tensor, vocab_size, num_seqs)
-    repetition_penalties = repetition_penalties.unsqueeze(dim=1).repeat(
-        1, vocab_size)
 
-    # If token appears in prompt or output, apply, otherwise use 1.0 for no-op.
-    penalties = torch.where(prompt_mask | output_mask, repetition_penalties,
-                            1.0)
-
-    # If logits are positive, divide by penalty, otherwise multiply by penalty.
-    scaling = torch.where(logits > 0, 1.0 / penalties, penalties)
-    logits *= scaling
+    # Apply repetition penalties as a custom op
+    from vllm._custom_ops import apply_repetition_penalties
+    apply_repetition_penalties(logits, prompt_mask, output_mask,
+                               repetition_penalties)
 
     # We follow the definition in OpenAI API.
     # Refer to https://platform.openai.com/docs/api-reference/parameter-details
@@ -70,9 +66,9 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
 def rocm_unquantized_gemm(x: torch.Tensor,
                           weight: torch.Tensor,
                           bias: Optional[torch.Tensor] = None):
-    from vllm.platforms.rocm import on_mi250_mi300
+    from vllm.platforms.rocm import on_gfx9
     k = weight.shape[1]
-    use_skinny = (envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi250_mi300() and \
+    use_skinny = (envs.VLLM_ROCM_USE_SKINNY_GEMM and on_gfx9() and \
                     x.dtype in [torch.float16, torch.bfloat16] \
                     and k % 8 == 0 and bias is None)
 
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 46d2075af99daac79de4b6250428aa9001f80502..0f636d83a6dd98e52a81836fcfe6f104740b8de4 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from dataclasses import dataclass
diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py
index a443a652d8a3f0e947f44367c98317f7dd5f4ae6..f364371033f5377718e94ddb20d13171e98013f0 100644
--- a/vllm/model_executor/model_loader/__init__.py
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py
index 010dd515784af3629b6945e7287f017e52f2c67c..5018c7d9a360b0069362903cf9f6ce53eed35fab 100644
--- a/vllm/model_executor/model_loader/base_loader.py
+++ b/vllm/model_executor/model_loader/base_loader.py
@@ -1,9 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 
+import torch
 import torch.nn as nn
 
 from vllm.config import LoadConfig, ModelConfig, VllmConfig
+from vllm.model_executor.model_loader.utils import (
+    initialize_model, process_weights_after_loading, set_default_torch_dtype)
 
 
 class BaseModelLoader(ABC):
@@ -18,7 +22,22 @@ class BaseModelLoader(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def load_model(self, *, vllm_config: VllmConfig,
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
+        """Load weights into a model. This standalone API allows 
+        inplace weights loading for an already-initialized model"""
+        raise NotImplementedError
+
+    def load_model(self, vllm_config: VllmConfig,
                    model_config: ModelConfig) -> nn.Module:
         """Load a model with the given configurations."""
-        raise NotImplementedError
+        device_config = vllm_config.device_config
+        target_device = torch.device(device_config.device)
+        with set_default_torch_dtype(model_config.dtype):
+            with target_device:
+                model = initialize_model(vllm_config=vllm_config,
+                                         model_config=model_config)
+            # Quantization does not happen in `load_weights` but after it
+            self.load_weights(model, model_config)
+            process_weights_after_loading(model, model_config, target_device)
+        return model.eval()
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index 0d83c8d53419902f1ce898e81faef874c338f170..068a4e355ff8d1cd13d98aa3d32dc561f8471036 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: SIM117
-import copy
 import fnmatch
 import glob
 import itertools
@@ -15,7 +15,7 @@ from huggingface_hub import HfApi
 from torch import nn
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
-from vllm.config import LoadConfig, ModelConfig, VllmConfig
+from vllm.config import LoadConfig, ModelConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 # yapf: enable
@@ -29,14 +29,14 @@ from vllm.model_executor.layers.linear import (LinearBase,
                                                RowParallelLinear)
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.utils import (ParamMapping,
-                                                    initialize_model,
                                                     set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
     download_safetensors_index_file_from_hf, download_weights_from_hf,
     filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
     pt_weights_iterator, safetensors_weights_iterator)
 from vllm.model_executor.models import is_pooling_model
-from vllm.model_executor.utils import set_weight_attrs
+from vllm.model_executor.utils import (get_packed_modules_mapping,
+                                       set_weight_attrs)
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
@@ -392,7 +392,8 @@ class BitsAndBytesModelLoader(BaseModelLoader):
     def _get_bnb_target_modules(self, model: nn.Module) -> None:
 
         for name, module in model.named_modules():
-            if isinstance(module, (LinearBase, )):
+            if (isinstance(module, LinearBase) and
+                    hasattr(module.quant_method, "quant_config")):
                 if modules_info := self.modules_mapping.get_sub_modules(name):
                     # Map vllm's names to transformers's names.
                     rep_name, sub_modules = modules_info
@@ -408,8 +409,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 ), "vllm currently does not support BNB quantization for"
         f" {type(model).__name__}"
 
-    def _load_weights(self, model_config: ModelConfig,
-                      model: nn.Module) -> None:
+    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
         if not hasattr(model, "load_weights"):
             raise AttributeError(
                 "The required method 'load_weights' is not defined in class"
@@ -420,8 +420,8 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 f"Model {type(model).__name__} does not support BitsAndBytes "
                 "quantization yet. No 'packed_modules_mapping' found.")
         self.is_pool_model=is_pooling_model(model)
-        self.modules_mapping = ParamMapping(
-            copy.deepcopy(model.packed_modules_mapping))
+
+        self.modules_mapping = ParamMapping(get_packed_modules_mapping(model))
 
         # For some models like Molmo, we need to use hf_to_vllm_mapper
         # to ensure correct loading of weights.
@@ -568,15 +568,3 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model, model_config.revision)
-
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
-        device_config = vllm_config.device_config
-        with set_default_torch_dtype(model_config.dtype):
-            with torch.device(device_config.device):
-
-                model = initialize_model(vllm_config=vllm_config)
-
-                self._load_weights(model_config, model)
-
-        return model.eval()
diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index 29a6e0af4bc677ad7f104ba9b308f887c7a8e91f..4624ff01ddc03b311364f1ad7f5719bae000df2d 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import dataclasses
 import glob
 import os
@@ -12,11 +13,9 @@ from torch import nn
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
 from vllm import envs
-from vllm.config import LoadConfig, LoadFormat, ModelConfig, VllmConfig
+from vllm.config import LoadConfig, LoadFormat, ModelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
-from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
     download_safetensors_index_file_from_hf, download_weights_from_hf,
     fastsafetensors_weights_iterator, filter_duplicate_safetensors_files,
@@ -264,32 +263,20 @@ class DefaultModelLoader(BaseModelLoader):
                               fall_back_to_pt=True,
                               allow_patterns_overrides=None)
 
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
-        device_config = vllm_config.device_config
-        target_device = torch.device(device_config.device)
-        with set_default_torch_dtype(model_config.dtype):
-            with target_device:
-                model = initialize_model(vllm_config=vllm_config,
-                                         model_config=model_config)
-
-            weights_to_load = {name for name, _ in model.named_parameters()}
-            loaded_weights = model.load_weights(
-                self.get_all_weights(model_config, model))
-            self.counter_after_loading_weights = time.perf_counter()
-            logger.info(
-                "Loading weights took %.2f seconds",
-                self.counter_after_loading_weights -
-                self.counter_before_loading_weights)
-            # We only enable strict check for non-quantized models
-            # that have loaded weights tracking currently.
-            if model_config.quantization is None and loaded_weights is not None:
-                weights_not_loaded = weights_to_load - loaded_weights
-                if weights_not_loaded:
-                    raise ValueError(
-                        "Following weights were not initialized from "
-                        f"checkpoint: {weights_not_loaded}")
-
-            process_weights_after_loading(model, model_config, target_device)
-
-        return model.eval()
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
+        weights_to_load = {name for name, _ in model.named_parameters()}
+        loaded_weights = model.load_weights(
+            self.get_all_weights(model_config, model))
+        self.counter_after_loading_weights = time.perf_counter()
+        logger.info(
+            "Loading weights took %.2f seconds",
+            self.counter_after_loading_weights -
+            self.counter_before_loading_weights)
+        # We only enable strict check for non-quantized models
+        # that have loaded weights tracking currently.
+        if model_config.quantization is None and loaded_weights is not None:
+            weights_not_loaded = weights_to_load - loaded_weights
+            if weights_not_loaded:
+                raise ValueError("Following weights were not initialized from "
+                                 f"checkpoint: {weights_not_loaded}")
diff --git a/vllm/model_executor/model_loader/dummy_loader.py b/vllm/model_executor/model_loader/dummy_loader.py
index 0e2f0be1ec26c98eb407b0a426a87dcf1d64ee32..f4a7da5744e04bfe1e67d215d739f347f27bbbd6 100644
--- a/vllm/model_executor/model_loader/dummy_loader.py
+++ b/vllm/model_executor/model_loader/dummy_loader.py
@@ -1,11 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
-import torch
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch.nn as nn
 
-from vllm.config import LoadConfig, ModelConfig, VllmConfig
+from vllm.config import LoadConfig, ModelConfig
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
-from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
     initialize_dummy_weights)
 
@@ -22,16 +20,8 @@ class DummyModelLoader(BaseModelLoader):
     def download_model(self, model_config: ModelConfig) -> None:
         pass  # Nothing to download
 
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
-        device_config = vllm_config.device_config
-        target_device = torch.device(device_config.device)
-        with set_default_torch_dtype(model_config.dtype):
-            with target_device:
-                model = initialize_model(vllm_config=vllm_config)
-            # NOTE(woosuk): For accurate performance evaluation, we assign
-            # random values to the weights.
-            initialize_dummy_weights(model)
-
-            process_weights_after_loading(model, model_config, target_device)
-        return model.eval()
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
+        # NOTE(woosuk): For accurate performance evaluation, we assign
+        # random values to the weights.
+        initialize_dummy_weights(model)
diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py
index 806004bf9604f1d3a9b14a0b58bf0561a28cd6b2..203c80760145a4b082355a70ebb5df5cb9ffadf5 100644
--- a/vllm/model_executor/model_loader/gguf_loader.py
+++ b/vllm/model_executor/model_loader/gguf_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from collections.abc import Generator
 
@@ -92,6 +93,13 @@ class GGUFModelLoader(BaseModelLoader):
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model)
 
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
+        local_model_path = self._prepare_weights(model_config.model)
+        gguf_weights_map = self._get_gguf_weights_map(model_config)
+        model.load_weights(
+            self._get_weights_iterator(local_model_path, gguf_weights_map))
+
     def load_model(self, vllm_config: VllmConfig,
                    model_config: ModelConfig) -> nn.Module:
         device_config = vllm_config.device_config
@@ -106,8 +114,7 @@ class GGUFModelLoader(BaseModelLoader):
         with set_default_torch_dtype(model_config.dtype):
             with target_device:
                 model = initialize_model(vllm_config=vllm_config)
-            model.load_weights(
-                self._get_weights_iterator(local_model_path, gguf_weights_map))
+            self.load_weights(model, model_config)
 
             process_weights_after_loading(model, model_config, target_device)
         return model
diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py
index e65d16cae76cb6195297dc6d1e9498417ab4b249..fad97aba84b6aaf9f69e0c5c968935e2b5315a45 100644
--- a/vllm/model_executor/model_loader/neuron.py
+++ b/vllm/model_executor/model_loader/neuron.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utilities for selecting and loading Neuron models in transformers-neuronx
 framework."""
 import ast
diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py
index 557feea46a907bdd623bc679871cb3c0381005c6..f450961c64ff4931b2ba193fbc713b821ed9815d 100644
--- a/vllm/model_executor/model_loader/neuronx_distributed.py
+++ b/vllm/model_executor/model_loader/neuronx_distributed.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utilities for selecting and loading Neuron models in
 neuronx-distributed-inference framework."""
 # Disabling yapf because yapf and isort have conflicts for the below imports
@@ -17,6 +18,8 @@ from neuronx_distributed_inference.models.config import (
     FusedSpecNeuronConfig, OnDeviceSamplingConfig)
 from neuronx_distributed_inference.models.mllama.utils import (
     create_vision_mask)
+from neuronx_distributed_inference.modules.lora_serving import (
+    LoraServingConfig)
 from neuronx_distributed_inference.utils.hf_adapter import (
     load_pretrained_config)
 from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig
@@ -80,25 +83,26 @@ class NeuronCausalLM(nn.Module):
         # Lazy initialized
         self.model: nn.Module
 
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        input_block_ids: torch.Tensor,
-        sampling_params: torch.Tensor,
-    ) -> torch.Tensor:
+    def forward(self,
+                input_ids: torch.Tensor,
+                positions: torch.Tensor,
+                input_block_ids: torch.Tensor,
+                sampling_params: torch.Tensor,
+                prev_hidden: Optional[torch.Tensor] = None,
+                adapter_ids: Optional[torch.Tensor] = None) -> torch.Tensor:
         # sort block ids sequentially for perf/neuron support reasons
         sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids)
         input_ids = torch.index_select(input_ids, 0, sorted_indices)
         positions = torch.index_select(positions, 0, sorted_indices)
         sampling_params = torch.index_select(sampling_params, 0,
                                              sorted_indices)
-
         output = self.model(input_ids,
                             attention_mask=None,
                             position_ids=positions,
                             seq_ids=sorted_input_block_ids,
-                            sampling_params=sampling_params)
+                            sampling_params=sampling_params,
+                            prev_hidden=prev_hidden,
+                            adapter_ids=adapter_ids)
         # on-device sampling
         if self.config.neuron_config.on_device_sampling_config:
             output = output.hidden_states
@@ -201,6 +205,11 @@ class NeuronMllamaForCausalLM(nn.Module):
                  config: PretrainedConfig,
                  on_device_sampling_disabled: bool = False) -> None:
         super().__init__()
+        # has_image is the only multimodal input that is used in
+        # token-generation
+        # This is a cache (on CPU) that saves has_image data per sequence id
+        # The number of entries in this cache is <= Batch-Size
+        self.has_image_cache: dict[int, torch.Tensor] = {}
         self.config = config
         self.logits_processor = LogitsProcessor(
             config.get_text_config().vocab_size, logits_as_input=True)
@@ -212,11 +221,57 @@ class NeuronMllamaForCausalLM(nn.Module):
 
         # Lazy initialized
         self.model: nn.Module
+        self.is_reorder_needed: bool = True
+
+    def read_from_has_image_cache(self, seq_ids: torch.Tensor):
+        has_image_list = []
+        for index in range(len(seq_ids)):
+            seq_id = seq_ids[index].item()
+            if seq_id in self.has_image_cache:
+                has_image_list.append(self.has_image_cache[seq_id])
+            else:
+                has_image_list.append(torch.tensor([0]))
+        return torch.tensor(has_image_list)
+
+    def write_to_has_image_cache(self, seq_ids: torch.Tensor,
+                                 has_image: torch.Tensor):
+        for index in range(len(seq_ids)):
+            seq_id = seq_ids[index].item()
+            if index < len(has_image):
+                self.has_image_cache[seq_id] = has_image[index]
+            else:
+                self.has_image_cache[seq_id] = torch.zeros(1)
 
     def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
                 seq_ids: torch.Tensor, pixel_values: torch.Tensor,
                 aspect_ratios: torch.Tensor, num_chunks: torch.Tensor,
                 has_image: torch.Tensor, sampling_params) -> torch.Tensor:
+
+        # We update the has_image cache during prefill
+        # and read the has_image cache during decode
+        if input_ids.shape[-1] > 1:  # prefill
+            self.write_to_has_image_cache(seq_ids, has_image)
+        else:
+            has_image = self.read_from_has_image_cache(seq_ids)
+            bs = input_ids.shape[0]
+            num_chunks = torch.zeros((bs, 1))
+            aspect_ratios = torch.zeros((bs, 1, 2))
+
+        input_block_ids = seq_ids
+        origin_input_block_ids = seq_ids
+        if self.is_reorder_needed:
+            # sort block ids sequentially for perf/neuron support reasons
+            input_block_ids, sorted_indices = torch.sort(input_block_ids)
+            input_ids = torch.index_select(input_ids, 0, sorted_indices)
+            positions = torch.index_select(positions, 0, sorted_indices)
+            sampling_params = torch.index_select(sampling_params, 0,
+                                                 sorted_indices)
+            pixel_values = torch.index_select(pixel_values, 0, sorted_indices)
+            aspect_ratios = torch.index_select(aspect_ratios, 0,
+                                               sorted_indices)
+            num_chunks = torch.index_select(num_chunks, 0, sorted_indices)
+            has_image = torch.index_select(has_image, 0, sorted_indices)
+
         self.vision_mask = create_vision_mask(input_ids, self.vision_token_id)
         output = self.model(
             input_ids.to(torch.int32),
@@ -232,8 +287,14 @@ class NeuronMllamaForCausalLM(nn.Module):
             has_image=has_image.to(torch.int32),
         )
         if self.config.neuron_config.on_device_sampling_config:
-            return output.hidden_states
-        return output.logits[:, -1, :]
+            output = output.hidden_states
+        else:
+            output = output.logits[:, -1, :]
+
+        if self.is_reorder_needed and origin_input_block_ids.shape[0] != 1:
+            restored_indices = torch.argsort(sorted_indices)
+            output = torch.index_select(output, 0, restored_indices)
+        return output
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
@@ -296,7 +357,7 @@ class NeuronMllamaForCausalLM(nn.Module):
             self.model = neuronx_model_cls(compiled_model_path)
             tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
             self.vision_token_id = tokenizer(
-                "<|image|>", add_special_tokens=False).input_ids
+                "<|image|>", add_special_tokens=False).input_ids[0]
             self.model.load(compiled_model_path)
             return
         except (FileNotFoundError, ValueError):
@@ -323,7 +384,7 @@ class NeuronMllamaForCausalLM(nn.Module):
 
         # Read "<|image|>" token_id from the tokenizer
         self.vision_token_id = tokenizer("<|image|>",
-                                         add_special_tokens=False).input_ids
+                                         add_special_tokens=False).input_ids[0]
         logger.info("\nLoading model from compiled checkpoint...")
         self.model.load(compiled_model_path)
 
@@ -522,7 +583,8 @@ def _get_model_architecture(config: PretrainedConfig) -> str:
 
 def _get_default_neuron_config(model_config: ModelConfig,
                                parallel_config: ParallelConfig,
-                               scheduler_config: SchedulerConfig):
+                               scheduler_config: SchedulerConfig,
+                               lora_serving_config: LoraServingConfig):
     """Generate a neuron config based on vllm config args."""
     on_device_sampling_config = OnDeviceSamplingConfig(dynamic=True,
                                                        deterministic=False)
@@ -541,7 +603,7 @@ def _get_default_neuron_config(model_config: ModelConfig,
         padding_side="right",
         on_device_sampling_config=on_device_sampling_config,
         sequence_parallel_enabled=True,
-    )
+        lora_serving_config=lora_serving_config)
     return neuron_config
 
 
@@ -581,7 +643,8 @@ def _get_neuron_config_after_override(default_neuron_config,
 
 def get_neuron_model(model_config: ModelConfig,
                      parallel_config: ParallelConfig,
-                     scheduler_config: SchedulerConfig) -> nn.Module:
+                     scheduler_config: SchedulerConfig,
+                     lora_serving_config: LoraServingConfig) -> nn.Module:
     """Initializes a neuron-optimized model for inference."""
     model_arch = _get_model_architecture(model_config.hf_config)
     if model_arch == "MllamaForConditionalGeneration":
@@ -589,7 +652,7 @@ def get_neuron_model(model_config: ModelConfig,
     else:
         model = NeuronCausalLM(model_config.hf_config)
     default_neuron_config_args = _get_default_neuron_config(
-        model_config, parallel_config, scheduler_config)
+        model_config, parallel_config, scheduler_config, lora_serving_config)
     neuron_config = _get_neuron_config_after_override(
         default_neuron_config_args, model_config.override_neuron_config)
 
diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py
index 9f1022c259251cd0783641a3c9921a06f8162eb7..83e0f386c10823d09e0878feed69557bb8004423 100644
--- a/vllm/model_executor/model_loader/runai_streamer_loader.py
+++ b/vllm/model_executor/model_loader/runai_streamer_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: SIM117
 import glob
 import os
@@ -9,10 +10,8 @@ import torch
 from torch import nn
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
-from vllm.config import LoadConfig, ModelConfig, VllmConfig
+from vllm.config import LoadConfig, ModelConfig
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
-from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
     download_safetensors_index_file_from_hf, download_weights_from_hf,
     runai_safetensors_weights_iterator)
@@ -100,21 +99,11 @@ class RunaiModelStreamerLoader(BaseModelLoader):
         """Download model if necessary"""
         self._prepare_weights(model_config.model, model_config.revision)
 
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
-        """Perform streaming of the model to destination"""
-        device_config = vllm_config.device_config
-        target_device = torch.device(device_config.device)
-        with set_default_torch_dtype(model_config.dtype):
-            with target_device:
-                model = initialize_model(vllm_config=vllm_config)
-
-            model_weights = model_config.model
-            if hasattr(model_config, "model_weights"):
-                model_weights = model_config.model_weights
-            model.load_weights(
-                self._get_weights_iterator(model_weights,
-                                           model_config.revision))
-
-            process_weights_after_loading(model, model_config, target_device)
-        return model.eval()
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
+        """Load weights into a model."""
+        model_weights = model_config.model
+        if hasattr(model_config, "model_weights"):
+            model_weights = model_config.model_weights
+        model.load_weights(
+            self._get_weights_iterator(model_weights, model_config.revision))
diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py
index 78bca89f0015e66a9bd30f61bc808838cfd80a03..2fd9cfba3f61a094cb826c9214406bcc680e9c1e 100644
--- a/vllm/model_executor/model_loader/sharded_state_loader.py
+++ b/vllm/model_executor/model_loader/sharded_state_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import collections
 import glob
@@ -9,11 +10,9 @@ from typing import Any, Optional
 import torch
 from torch import nn
 
-from vllm.config import LoadConfig, ModelConfig, VllmConfig
+from vllm.config import LoadConfig, ModelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
-from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
     download_weights_from_hf, runai_safetensors_weights_iterator)
 from vllm.transformers_utils.s3_utils import glob as s3_glob
@@ -100,11 +99,8 @@ class ShardedStateLoader(BaseModelLoader):
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model, model_config.revision)
 
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
-        device_config = vllm_config.device_config
-        target_device = torch.device(device_config.device)
-
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
         from vllm.distributed import get_tensor_model_parallel_rank
 
         model_weights = model_config.model
@@ -112,53 +108,47 @@ class ShardedStateLoader(BaseModelLoader):
             model_weights = model_config.model_weights
         local_model_path = model_weights
 
-        with set_default_torch_dtype(model_config.dtype):
-            with target_device:
-                model = initialize_model(vllm_config=vllm_config)
-                process_weights_after_loading(model, model_config,
-                                              target_device)
-            rank = get_tensor_model_parallel_rank()
-            pattern = os.path.join(
-                local_model_path,
-                self.pattern.format(rank=rank, part="*"),
-            )
-
-            filepaths = []
-            if is_s3(local_model_path):
-                file_pattern = f"*{self.pattern.format(rank=rank, part=' * ')}"
-                filepaths = s3_glob(path=local_model_path,
-                                    allow_pattern=[file_pattern])
-            else:
-                filepaths = glob.glob(pattern)
-            if not filepaths:
-                # TODO: support un-sharded checkpoints too
-                raise ValueError(
-                    f"Could not find checkpoint files '{pattern}', only "
-                    f"pre-sharded checkpoints are currently supported!")
-            state_dict = self._filter_subtensors(model.state_dict())
-            for key, tensor in self.iterate_over_files(filepaths):
-                # If loading with LoRA enabled, additional padding may
-                # be added to certain parameters. We only load into a
-                # narrowed view of the parameter data.
-                param_data = state_dict[key].data
-                param_shape = state_dict[key].shape
-                for dim, size in enumerate(tensor.shape):
-                    if size < param_shape[dim]:
-                        param_data = param_data.narrow(dim, 0, size)
-                if tensor.shape != param_shape:
-                    logger.warning(
-                        "loading tensor of shape %s into "
-                        "parameter '%s' of shape %s",
-                        tensor.shape,
-                        key,
-                        param_shape,
-                    )
-                param_data.copy_(tensor)
-                state_dict.pop(key)
-            if state_dict:
-                raise ValueError(
-                    f"Missing keys {tuple(state_dict)} in loaded state!")
-        return model.eval()
+        rank = get_tensor_model_parallel_rank()
+        pattern = os.path.join(
+            local_model_path,
+            self.pattern.format(rank=rank, part="*"),
+        )
+
+        filepaths = []
+        if is_s3(local_model_path):
+            file_pattern = f"*{self.pattern.format(rank=rank, part=' * ')}"
+            filepaths = s3_glob(path=local_model_path,
+                                allow_pattern=[file_pattern])
+        else:
+            filepaths = glob.glob(pattern)
+        if not filepaths:
+            # TODO: support un-sharded checkpoints too
+            raise ValueError(
+                f"Could not find checkpoint files '{pattern}', only "
+                f"pre-sharded checkpoints are currently supported!")
+        state_dict = self._filter_subtensors(model.state_dict())
+        for key, tensor in self.iterate_over_files(filepaths):
+            # If loading with LoRA enabled, additional padding may
+            # be added to certain parameters. We only load into a
+            # narrowed view of the parameter data.
+            param_data = state_dict[key].data
+            param_shape = state_dict[key].shape
+            for dim, size in enumerate(tensor.shape):
+                if size < param_shape[dim]:
+                    param_data = param_data.narrow(dim, 0, size)
+            if tensor.shape != param_shape:
+                logger.warning(
+                    "loading tensor of shape %s into "
+                    "parameter '%s' of shape %s",
+                    tensor.shape,
+                    key,
+                    param_shape,
+                )
+            param_data.copy_(tensor)
+            state_dict.pop(key)
+        if state_dict:
+            raise ValueError(
+                f"Missing keys {tuple(state_dict)} in loaded state!")
 
     def iterate_over_files(
             self, paths) -> Generator[tuple[str, torch.Tensor], None, None]:
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 4c4502284a6afc032b3c85ed268dd181c46ebfac..24d1e136539a77988ac8802cb555cb051706aa28 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import contextlib
@@ -21,7 +22,8 @@ from torch.utils._python_dispatch import TorchDispatchMode
 from transformers import PretrainedConfig
 
 import vllm.envs as envs
-from vllm.config import ModelConfig, ParallelConfig, set_current_vllm_config
+from vllm.config import (ModelConfig, ParallelConfig, VllmConfig,
+                         set_current_vllm_config)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -208,12 +210,6 @@ class TensorizerConfig:
                            **tensorizer_args.stream_params)
 
 
-def load_with_tensorizer(tensorizer_config: TensorizerConfig,
-                         **extra_kwargs) -> nn.Module:
-    tensorizer = TensorizerAgent(tensorizer_config, **extra_kwargs)
-    return tensorizer.deserialize()
-
-
 @dataclass
 class TensorizerArgs:
     tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, BinaryIO, str,
@@ -366,100 +362,72 @@ class TensorizerArgs:
         return tensorizer_args
 
 
-class TensorizerAgent:
-    """
-    A class for performing tensorizer deserializations specifically for
-    vLLM models using plaid_mode. Uses TensorizerArgs to configure the
-    behavior of the TensorDeserializer when loading tensors from a serialized
-    model. For deserializations of HuggingFace models, TensorDeserializer is
-    instead used as an iterator directly in the func hf_model_weights_iterator
-    in vllm/model_executor/model_loader/weight_utils.py
-    """
-
-    def __init__(self, tensorizer_config: TensorizerConfig, vllm_config):
-        self.tensorizer_config = tensorizer_config
-        self.tensorizer_args = (
-            self.tensorizer_config._construct_tensorizer_args())
-        self.vllm_config = vllm_config
-        self.model = self._init_model()
-
-    def _init_model(self):
-        assert self.tensorizer_config.hf_config is not None
-        model_args = self.tensorizer_config.hf_config
-        model_args.torch_dtype = self.tensorizer_config.dtype
-        assert self.tensorizer_config.model_class is not None
-        # TODO: Do we need to consider old-style model class?
-        with meta_tensor_mode(), set_current_vllm_config(self.vllm_config,
-                                                         check_compile=True):
-            return self.tensorizer_config.model_class(
-                vllm_config=self.vllm_config)
-
-    def _resize_lora_embeddings(self):
-        """Modify LoRA embedding layers to use bigger tensors
-        to allow for adapter added tokens."""
-        for child in self.model.modules():
-            if (isinstance(child, VocabParallelEmbedding)
-                    and child.weight.shape[0]
-                    < child.num_embeddings_per_partition):
-                new_weight = torch.empty(child.num_embeddings_per_partition,
-                                         child.embedding_dim,
-                                         dtype=child.weight.dtype,
-                                         device=child.weight.device)
-                new_weight[:child.weight.shape[0]].copy_(child.weight.data)
-                new_weight[child.weight.shape[0]:].fill_(0)
-                child.weight.data = new_weight
-
-    def _check_tensors_on_meta_device(self):
-        for tensor in self.model.state_dict().values():
-            if tensor.device.type == 'meta':
-                raise ValueError(
-                    "The serialized model contains tensors on the meta device,"
-                    " indicating that some tensors were not loaded properly."
-                    " Please check that the parameters of the model being"
-                    " specified match that of the serialized model, such as"
-                    " its quantization.")
-
-    def deserialize(self):
-        """
-        Deserialize the model using the TensorDeserializer. This method is
-        specifically for vLLM models using tensorizer's plaid_mode.
-
-        The deserializer makes use of tensorizer_args.stream_params
-        to configure the behavior of the stream when loading tensors from a
-        serialized model. The deserializer_params are used to configure the
-        behavior of the TensorDeserializer when loading tensors themselves.
-        Documentation on these params can be found in TensorizerArgs
-
-        Returns:
-            nn.Module: The deserialized model.
-        """
-        before_mem = get_mem_usage()
-        start = time.perf_counter()
-        with _read_stream(
-                self.tensorizer_config.tensorizer_uri,
-                **self.tensorizer_args.stream_params
-        ) as stream, TensorDeserializer(
+def _check_tensors_on_meta_device(model: nn.Module) -> None:
+    for tensor in model.state_dict().values():
+        if tensor.device.type == 'meta':
+            raise ValueError(
+                "The serialized model contains tensors on the meta device,"
+                " indicating that some tensors were not loaded properly."
+                " Please check that the parameters of the model being"
+                " specified match that of the serialized model, such as"
+                " its quantization.")
+
+
+def _resize_lora_embeddings(model: nn.Module):
+    """Modify LoRA embedding layers to use bigger tensors
+    to allow for adapter added tokens."""
+    for child in model.modules():
+        if (isinstance(child, VocabParallelEmbedding) and child.weight.shape[0]
+                < child.num_embeddings_per_partition):
+            new_weight = torch.empty(child.num_embeddings_per_partition,
+                                     child.embedding_dim,
+                                     dtype=child.weight.dtype,
+                                     device=child.weight.device)
+            new_weight[:child.weight.shape[0]].copy_(child.weight.data)
+            new_weight[child.weight.shape[0]:].fill_(0)
+            child.weight.data = new_weight
+
+
+def init_tensorizer_model(tensorizer_config: TensorizerConfig,
+                          vllm_config: VllmConfig) -> nn.Module:
+    assert tensorizer_config.hf_config is not None
+    model_args = tensorizer_config.hf_config
+    model_args.torch_dtype = tensorizer_config.dtype
+    assert tensorizer_config.model_class is not None
+    # TODO: Do we need to consider old-style model class?
+    with meta_tensor_mode(), set_current_vllm_config(vllm_config,
+                                                     check_compile=True):
+        return tensorizer_config.model_class(vllm_config=vllm_config)
+
+
+def deserialize_tensorizer_model(model: nn.Module,
+                                 tensorizer_config: TensorizerConfig) -> None:
+    tensorizer_args = tensorizer_config._construct_tensorizer_args()
+    before_mem = get_mem_usage()
+    start = time.perf_counter()
+    with _read_stream(
+            tensorizer_config.tensorizer_uri,
+            **tensorizer_args.stream_params) as stream, TensorDeserializer(
                 stream,
-                dtype=self.tensorizer_config.dtype,
+                dtype=tensorizer_config.dtype,
                 device=f'cuda:{torch.cuda.current_device()}',
-                **self.tensorizer_args.deserializer_params) as deserializer:
-            deserializer.load_into_module(self.model)
-            end = time.perf_counter()
-
-        total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
-        duration = end - start
-        per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
-        after_mem = get_mem_usage()
-        deserializer.close()
-        logger.info("Deserialized %s in %0.2fs, %s/s", total_bytes_str,
-                    end - start, per_second)
-        logger.info("Memory usage before: %s", before_mem)
-        logger.info("Memory usage after: %s", after_mem)
-
-        self._check_tensors_on_meta_device()
-        self._resize_lora_embeddings()
-        del self.model.vllm_tensorized_marker
-        return self.model.eval()
+                **tensorizer_args.deserializer_params) as deserializer:
+        deserializer.load_into_module(model)
+        end = time.perf_counter()
+
+    total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
+    duration = end - start
+    per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
+    after_mem = get_mem_usage()
+    deserializer.close()
+    logger.info("Deserialized %s in %0.2fs, %s/s", total_bytes_str,
+                end - start, per_second)
+    logger.info("Memory usage before: %s", before_mem)
+    logger.info("Memory usage after: %s", after_mem)
+
+    _check_tensors_on_meta_device(model)
+    _resize_lora_embeddings(model)
+    del model.vllm_tensorized_marker
 
 
 def tensorizer_weights_iterator(
diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py
index 2afe2b59e2f9a1a83743e984bd2e48d312cd6baf..b9982f312fe529adc5d579de36a07f6b2785ec10 100644
--- a/vllm/model_executor/model_loader/tensorizer_loader.py
+++ b/vllm/model_executor/model_loader/tensorizer_loader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: SIM117
 import copy
 from collections.abc import Generator
@@ -11,8 +12,8 @@ from vllm.config import LoadConfig, ModelConfig, ParallelConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.tensorizer import (
-    TensorizerConfig, is_vllm_tensorized, load_with_tensorizer,
-    serialize_vllm_model, tensorizer_weights_iterator)
+    TensorizerConfig, deserialize_tensorizer_model, init_tensorizer_model,
+    is_vllm_tensorized, serialize_vllm_model, tensorizer_weights_iterator)
 from vllm.model_executor.model_loader.utils import (get_model_architecture,
                                                     initialize_model,
                                                     set_default_torch_dtype)
@@ -61,38 +62,34 @@ class TensorizerLoader(BaseModelLoader):
             model.load_weights(self._get_weights_iterator())
         return model.eval()
 
-    def _load_model_serialized(
-        self,
-        vllm_config: VllmConfig,
-    ) -> nn.Module:
-        """Load a serialized model with tensorizer.
-
-        Expects a vLLM-tensorized model. See the
-        examples/others/tensorize_vllm_model.py example script
-        for serializing vLLM models."""
-
-        device_config = vllm_config.device_config
-        model_config = vllm_config.model_config
-
-        with set_default_torch_dtype(model_config.dtype):
-            with torch.device(device_config.device):
-                model_class = get_model_architecture(model_config)[0]
-
-                tensorizer_config = copy.copy(self.tensorizer_config)
-                tensorizer_config.model_class = model_class
-                tensorizer_config.hf_config = model_config.hf_config
-                tensorizer_config.dtype = model_config.dtype
-
-                model = load_with_tensorizer(tensorizer_config,
-                                             vllm_config=vllm_config)
-        return model.eval()
-
     def download_model(self, model_config: ModelConfig) -> None:
         self.tensorizer_config.verify_with_model_config(model_config)
 
         with self.tensorizer_config.open_stream():
             pass
 
+    def _patch_tensorizer_config(
+            self, model_config: ModelConfig) -> TensorizerConfig:
+        model_class = get_model_architecture(model_config)[0]
+        tensorizer_config = copy.copy(self.tensorizer_config)
+        tensorizer_config.model_class = model_class
+        tensorizer_config.hf_config = model_config.hf_config
+        tensorizer_config.dtype = model_config.dtype
+        return tensorizer_config
+
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
+        """Load serialized model weights with tensorizer.
+
+        Expects a vLLM-tensorized model. See the
+        examples/others/tensorize_vllm_model.py example script
+        for serializing vLLM models."""
+        if is_vllm_tensorized(self.tensorizer_config):
+            tensorizer_config = self._patch_tensorizer_config(model_config)
+            deserialize_tensorizer_model(model, tensorizer_config)
+        else:
+            model.load_weights(self._get_weights_iterator())
+
     def load_model(self, vllm_config: VllmConfig,
                    model_config: ModelConfig) -> nn.Module:
         parallel_config = vllm_config.parallel_config
@@ -106,7 +103,11 @@ class TensorizerLoader(BaseModelLoader):
                 get_tensor_model_parallel_rank())
 
         if is_vllm_tensorized(self.tensorizer_config):
-            return self._load_model_serialized(vllm_config=vllm_config)
+            tensorizer_config = self._patch_tensorizer_config(model_config)
+            model = init_tensorizer_model(tensorizer_config=tensorizer_config,
+                                          vllm_config=vllm_config)
+            self.load_weights(model, model_config)
+            return model
         return self._load_model_serialized_cpu(vllm_config=vllm_config)
 
     @staticmethod
diff --git a/vllm/model_executor/model_loader/tpu.py b/vllm/model_executor/model_loader/tpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..6197bcdba826b0d2f25486392b1fcb5662719623
--- /dev/null
+++ b/vllm/model_executor/model_loader/tpu.py
@@ -0,0 +1,112 @@
+# SPDX-License-Identifier: Apache-2.0
+import time
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch_xla.core.xla_model as xm
+import torch_xla.distributed.spmd as xs
+
+from vllm.config import ModelConfig, VllmConfig
+from vllm.distributed.tpu_distributed_utils import get_fqn, shard_model
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
+from vllm.model_executor.model_loader.utils import (
+    initialize_model, process_weights_after_loading, set_default_torch_dtype)
+
+logger = init_logger(__name__)
+
+
+class TPUModelLoader(DefaultModelLoader):
+    """
+    A TPU model loader for model loading under SPMD mode.
+    """
+
+    def load_model(
+        self,
+        vllm_config: VllmConfig,
+        model_config: ModelConfig,
+        mesh: Optional[xs.Mesh] = None,
+    ) -> nn.Module:
+        # Initialize model and load weights on CPU. Then, during SPMD partition,
+        # weights are sharded and transferred to TPUs.
+        self.counter_before_loading_weights = time.perf_counter()
+        model_config = vllm_config.model_config
+        assert model_config.quantization is None, "Quantization not supported"
+        target_device = torch.device('cpu')
+        with set_default_torch_dtype(model_config.dtype):
+            with target_device:
+                model = initialize_model(vllm_config=vllm_config)
+
+            load_format = vllm_config.load_config.load_format
+            if load_format != "dummy":
+                weights_to_load = {
+                    name
+                    for name, _ in model.named_parameters()
+                }
+                all_weights = self.get_all_weights(model_config, model)
+                loaded_weights = model.load_weights(all_weights)
+                self.counter_after_loading_weights = time.perf_counter()
+                logger.info(
+                    "Loading weights took %.2f seconds",
+                    self.counter_after_loading_weights -
+                    self.counter_before_loading_weights)
+                # We only enable strict check for non-quantized models
+                # that have loaded weights tracking currently.
+                if model_config.quantization is None and \
+                    loaded_weights is not None:
+                    weights_not_loaded = weights_to_load - loaded_weights
+                    if weights_not_loaded:
+                        raise ValueError(
+                            "Following weights were not initialized from "
+                            f"checkpoint: {weights_not_loaded}")
+            else:
+                logger.info("Use dummy weight during weight loading.")
+
+            process_weights_after_loading(model, model_config, target_device)
+
+        counter_before_partition = time.perf_counter()
+        model = model.eval()
+        model = model.to('xla')
+        shard_model(model, mesh)
+        counter_after_partition = time.perf_counter()
+        logger.info("Partition model took %.2f seconds",
+                    counter_after_partition - counter_before_partition)
+
+        # Ensure the model is properly loaded.
+        self._check_model_is_loaded(mesh, model)
+
+        # Need to torch compile after model sharding are done. Because the
+        # compiler hints ('xs.mark_sharding') are torch ops.
+        if not model_config.is_multimodal_model:
+            model.model = torch.compile(model.model, backend="openxla")
+        else:
+            model.language_model.model = \
+                torch.compile(model.language_model.model, backend="openxla")
+        return model
+
+    def _check_model_is_loaded(self, mesh: Optional[xs.Mesh],
+                               model: nn.Module) -> None:
+        """
+        Ensure the model is properly loaded.
+        1. All model parameters and buffers are on XLA device.
+        2. Non-SPMD friendly layers are replaced as expected.
+        """
+        device = xm.xla_device()
+        device_type = str(device.type)
+
+        # Check parameters
+        for name, param in model.named_parameters():
+            assert param.device.type == device_type, f"Parameter {name} is on \
+                {param.device.type} instead of {device_type}"
+
+        # Check buffers
+        for name, buffer in model.named_buffers():
+            assert buffer.device.type == device_type, \
+                f"Buffer {name} is on {buffer.device.type} instead of \
+                    {device_type}"
+
+        for module in model.modules():
+            if (mesh is not None) and (get_fqn(module) == 'QKVParallelLinear'):
+                raise AssertionError("QKVParallelLinear should be replaced by \
+                            XlaQKVParallelLinear under SPMD mode.")
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 9c8d647a24feaf6d4a9d2e0624a3a8a019e698ea..e6eaade090275954d7c380b6364ebe4a280d9b3e 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utilities for selecting and loading models."""
 import contextlib
 import inspect
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index f61956f4e8e01e9019b9b40ebee743b7456b4272..857f4bca6824525d02ba9d3f65c8d770743bd7e0 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utilities for downloading and initializing model weights."""
 import fnmatch
 import glob
@@ -696,7 +697,7 @@ def initialize_dummy_weights(
                 # Note: We avoid using torch.rank_like as it doesn't currently
                 # support the generator argument.
                 param.copy_((high - low) *
-                            torch.rand(*param.shape,
+                            torch.rand(param.shape,
                                        generator=generator,
                                        dtype=param.dtype,
                                        layout=param.layout,
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 3580c4fa5252523914b95dee96253f55d09ad92c..27c169d2d1e81580a4169bd74ce47cf39c77dfe1 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal,
                          SupportsPP, SupportsV0Only, has_inner_state,
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 6ab03c40ab4a2c07f6b60721e369a4e572a528c2..1651e3e429e64601babfd993287976c623ea9963 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any, Optional, TypeVar
diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py
index aefd6c97375524c8354708db95db5a1f9c062ac5..b13d863ebb744e7cf407ed3a09dd8ebe3b9beec8 100644
--- a/vllm/model_executor/models/aimv2.py
+++ b/vllm/model_executor/models/aimv2.py
@@ -1,17 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # A modified implementation of the AIMv2 Transformer
 # inserted here also the image tokenizer used by Ovis2
+from collections.abc import Iterable
 from typing import Optional
 
 import torch
 import torch.nn as nn
-from torch.nn import functional as F
 
+from vllm.attention.layer import MultiHeadAttention
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed.utils import divide
+from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.transformers_utils.configs.ovis import AIMv2Config
 
 
@@ -24,29 +32,27 @@ class AIMv2SwiGLUFFN(nn.Module):
         in_features = config.hidden_size
         bias = config.use_bias
 
-        # TODO(Isotr0py): investigate if we can add TP to visual tokenizer
-        self.fc1 = ReplicatedLinear(in_features,
-                                    hidden_features,
-                                    bias=bias,
-                                    quant_config=quant_config,
-                                    prefix=f"{prefix}.fc1")
-        self.fc2 = ReplicatedLinear(hidden_features,
-                                    in_features,
-                                    bias=bias,
-                                    quant_config=quant_config,
-                                    prefix=f"{prefix}.fc2")
-        self.fc3 = ReplicatedLinear(in_features,
-                                    hidden_features,
-                                    bias=bias,
-                                    quant_config=quant_config,
-                                    prefix=f"{prefix}.fc3")
+        self.fc13 = MergedColumnParallelLinear(
+            in_features,
+            [hidden_features] * 2,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc13",
+        )
+        self.fc2 = RowParallelLinear(
+            input_size=hidden_features,
+            output_size=in_features,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+        self.act_fn = SiluAndMul()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x_parallel, _ = self.fc1(x)
-        gate, _ = self.fc3(x)
-        x_parallel = F.silu(x_parallel) * gate
-        out, _ = self.fc2(x_parallel)
-        return out
+        x, _ = self.fc13(x)
+        x = self.act_fn(x)
+        x, _ = self.fc2(x)
+        return x
 
 
 class AIMv2PatchEmbed(nn.Module):
@@ -90,39 +96,45 @@ class AIMv2Attention(nn.Module):
     def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
                  prefix: str):
         super().__init__()
-        dim = config.hidden_size
-
-        # TODO(Isotr0py): investigate if we can add TP to visual tokenizer
+        self.config = config
+        self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
-        self.qkv = ReplicatedLinear(dim, dim * 3, bias=config.qkv_bias)
-        # self.qkv = QKVParallelLinear(
-        #               hidden_size=dim,
-        #               head_size=dim // config.num_attention_heads,
-        #               total_num_heads=config.num_attention_heads,
-        #               bias=config.qkv_bias,
-        #               quant_config=quant_config,
-        #               prefix=f"{prefix}.qkv")
-        self.proj = ReplicatedLinear(dim, dim, bias=config.use_bias)
-        # self.proj = RowParallelLinear(input_size=dim,
-        #                  output_size=dim,
-        #                  bias = config.use_bias,
-        #                  quant_config=quant_config,
-        #                  prefix=f"{prefix}.proj")
-
-    def forward(  # todo might implement multiple attn implementations
-            self,
-            x: torch.Tensor,
-            mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        B, N, C = x.shape
-        qkv, _ = self.qkv(x)
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                "embed_dim must be divisible by num_heads "
+                f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads}).")
+        self.scale = self.head_dim**-0.5
+
+        self.qkv = QKVParallelLinear(
+            hidden_size=self.embed_dim,
+            head_size=self.head_dim,
+            total_num_heads=self.num_heads,
+            bias=config.qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv",
+        )
+
+        self.proj = RowParallelLinear(
+            input_size=self.embed_dim,
+            output_size=self.embed_dim,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.proj",
+        )
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
 
-        qkv = qkv.reshape(B, N, 3, self.num_heads,
-                          C // self.num_heads).permute(2, 0, 3, 1, 4)
+        self.attn = MultiHeadAttention(self.num_heads_per_partition,
+                                       self.head_dim, self.scale)
 
-        q, k, v = qkv.unbind(0)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        qkv, _ = self.qkv(x)
+        q, k, v = qkv.chunk(3, dim=-1)
 
-        x = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
-        x = x.transpose(1, 2).contiguous().reshape(B, N, C)
+        x = self.attn(q, k, v)
         x, _ = self.proj(x)
         return x
 
@@ -141,37 +153,40 @@ class AIMv2Block(nn.Module):
                                   prefix=f"{prefix}.mlp")
         self.norm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
-    def forward(self,
-                x: torch.Tensor,
-                mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        x = x + self.attn(self.norm_1.forward_native(x), mask)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.attn(self.norm_1.forward_native(x))
         x = x + self.mlp(self.norm_2.forward_native(x))
         return x
 
 
 class AIMv2Transformer(nn.Module):
 
-    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
-                 prefix: str):
+    def __init__(
+        self,
+        config: AIMv2Config,
+        quant_config: QuantizationConfig,
+        *,
+        require_post_norm: Optional[bool] = None,
+        prefix: str = "",
+    ):
         super().__init__()
 
         self.blocks = nn.ModuleList([
             AIMv2Block(config, quant_config, prefix=f"{prefix}.blocks.{i}")
             for i in range(config.num_hidden_layers)
         ])
-        self.post_trunk_norm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
+        if require_post_norm:
+            self.post_trunk_norm = RMSNorm(config.hidden_size,
+                                           eps=config.rms_norm_eps)
+        else:
+            self.post_trunk_norm = None
 
-    def forward(
-        self,
-        tokens: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
         # they take the -1 as the ref embeddings, like a clip skip
         for block in self.blocks:
-            tokens = block(tokens, mask)
-        # NO NORM IN THE OG IMPLEMENTATION
-        # tokens = self.post_trunk_norm(tokens)
+            tokens = block(tokens)
+        if self.post_trunk_norm is not None:
+            tokens = self.post_trunk_norm(tokens)
         return tokens
 
 
@@ -180,20 +195,52 @@ class AIMv2Model(torch.nn.Module):
     def __init__(self,
                  config: AIMv2Config,
                  quant_config: QuantizationConfig,
+                 *,
+                 require_post_norm: Optional[bool] = None,
                  prefix: str = ""):
         super().__init__()
         self.preprocessor = AIMv2ViTPreprocessor(config)
         self.trunk = AIMv2Transformer(config,
                                       quant_config=quant_config,
+                                      require_post_norm=require_post_norm,
                                       prefix=f"{prefix}.trunk")
 
-    def forward(
-        self,
-        pixel_values: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
 
         x = self.preprocessor(pixel_values)
-        x = self.trunk(x, mask)
+        x = self.trunk(x)
 
         return x
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".fc13", ".fc1", 0),
+            (".fc13", ".fc3", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            # post_layernorm is optional in SiglipVisionModel
+            if (name.startswith("trunk.post_trunk_norm")
+                    and self.trunk.post_trunk_norm is None):
+                continue
+
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index 94a4328564bbb6de407d26ec92085de1a53e6b0f..4693c9487a8bf1e1cda84bf1b2080c4b84c4e27f 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only Snowflake Arctic model."""
 from collections.abc import Iterable
 from typing import Optional, Union
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index f74e13888c48edbd2b792b3da067f98962cbc582..bb4177dfc45744c851f748bfe1066cba0ac3dcdc 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Optional, TypedDict, Union
 
diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index 08d49d71eca12f4efdce7403e3fb7111a15e2275..7e15e57a4d03219d7ffd42c243a9a4ae64e4d15a 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -1,4 +1,5 @@
-# SPDX-License-Identifier: Apache-2.0 Adapted from
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project Adapted from
 # https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Literal, Optional, TypedDict, Union, cast
@@ -110,7 +111,13 @@ class AyaVisionProcessingInfo(BaseProcessingInfo):
         return self.ctx.get_hf_config(AyaVisionConfig)
 
     def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor:
-        return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
+        processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
+
+        # Temporary workaround since this processor has multiple image tokens
+        # See https://github.com/huggingface/transformers/issues/38350
+        processor._check_special_mm_tokens = lambda *args, **kwargs: None
+
+        return processor
 
     def get_image_processor(self) -> GotOcr2ImageProcessor:
         return self.get_hf_processor().image_processor
@@ -187,9 +194,7 @@ class AyaVisionMultiModalProcessor(
         image_processor = hf_processor.image_processor
 
         # HF processor pops the `num_patches` kwarg, which is needed by vLLM
-        if (images :=
-                mm_data.get("images")) is not None and '<image>' in prompt:
-            assert isinstance(images, list)
+        if (images := mm_data.get("images")) is not None:
             parsed_images = (self._get_data_parser().parse_mm_data({
                 "image":
                 images
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index bcff6eb3fd315413b5f024bd92f1a3ed124baf15..0de5de5e835ac1493eb9dfd980188c918a6d4146 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index d6a705fb1859a3660c60cc371017ba85bf99359c..29e0e2a2edb15e759d4f9f76c92a9496e9969610 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only Bamba model."""
 # Added by the IBM Team, 2024
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index 92bbe1bb67a3cc5d7773c1827b403f12491b30f0..a0ec12674f19bbe3cbe2c3f6c9c5395eb44c7a90 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Derived from BART implementation posted on HuggingFace; license below:
 #
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index 0c6593bbe3a100dced04f82ca043deab8bba8080..cacec7342ac2e55d5088343d4d93eee6117a8362 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Optional
@@ -16,7 +17,7 @@ from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.pooler import (CrossEncodingPooler, Pooler,
+from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler,
                                                PoolingType)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -413,10 +414,15 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant):
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        return self.model(input_ids=input_ids,
-                          position_ids=positions,
-                          inputs_embeds=inputs_embeds,
-                          intermediate_tensors=intermediate_tensors)
+        hidden_states = self.model(input_ids=input_ids,
+                                   position_ids=positions,
+                                   inputs_embeds=inputs_embeds,
+                                   intermediate_tensors=intermediate_tensors)
+
+        # convert the embedding output to float32,
+        # otherwise precision will be lost significantly
+        hidden_states = hidden_states.to(torch.float32)
+        return hidden_states
 
     def pooler(
         self,
@@ -470,8 +476,8 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding,
                               embedding_class=BertEmbedding,
                               add_pooling_layer=True)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-        self._pooler = CrossEncodingPooler(config, self.classifier,
-                                           self.bert.pooler)
+        self._pooler = ClassifierPooler(vllm_config.model_config,
+                                        self.classifier, self.bert.pooler)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
 
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py
index af6deb3bf072e0030442919945b7ba57a7fdd9da..d1b84a9f04fa9b22bb9ff9e1d9207b919692fba6 100644
--- a/vllm/model_executor/models/bert_with_rope.py
+++ b/vllm/model_executor/models/bert_with_rope.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
+from copy import deepcopy
 from typing import Optional
 
 import torch
@@ -10,6 +12,7 @@ from vllm.attention import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import (get_act_and_mul_fn,
                                                    get_act_fn)
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -27,6 +30,8 @@ from vllm.model_executor.models.interfaces import SupportsQuant
 from vllm.model_executor.models.utils import WeightsMapper
 from vllm.sequence import IntermediateTensors
 
+logger = init_logger(__name__)
+
 
 class BertWithRopeEmbedding(nn.Module):
 
@@ -427,7 +432,12 @@ class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant):
         else:
             hidden_states = self.embeddings(input_ids=input_ids,
                                             token_type_ids=token_type_ids)
-        return self.encoder(positions, hidden_states)
+        hidden_states = self.encoder(positions, hidden_states)
+
+        # convert the embedding output to float32,
+        # otherwise precision will be lost significantly
+        hidden_states = hidden_states.to(torch.float32)
+        return hidden_states
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
@@ -513,10 +523,11 @@ class NomicBertModel(BertWithRope):
 
         head_dim = config.hidden_size // config.num_attention_heads
         rotary_emb_dim = head_dim * config.rotary_emb_fraction
+        max_trained_positions = getattr(config, "max_trained_positions", 2048)
         config.rotary_kwargs = {
             "head_size": head_dim,
             "rotary_dim": rotary_emb_dim,
-            "max_position": config.max_trained_positions,
+            "max_position": max_trained_positions,
             "base": getattr(config, "rope_theta", config.rotary_emb_base),
             "rope_scaling": getattr(config, "rope_scaling", None)
         }
@@ -525,8 +536,52 @@ class NomicBertModel(BertWithRope):
         # than max_trained_positions 2048, the results are consistent
         # with SentenceTransformer.
         # The context extension uses vllm style rope_theta and rope_scaling.
-        # See #17785
-
+        # See #17785 #18755
+        if (not vllm_config.model_config.hf_overrides
+                and vllm_config.model_config.original_max_model_len is None):
+            # Default
+            # Reset max_model_len to max_trained_positions.
+            # nomic-embed-text-v2-moe the length is set to 512
+            # by sentence_bert_config.json.
+            max_model_len_before = vllm_config.model_config.max_model_len
+            max_model_len = min(vllm_config.model_config.max_model_len,
+                                max_trained_positions)
+
+            vllm_config.recalculate_max_model_len(max_model_len)
+            logger.warning(
+                "Nomic context extension is disabled. "
+                "Changing max_model_len from %s to %s. "
+                "To enable context extension, see: "
+                "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.html",
+                max_model_len_before, vllm_config.model_config.max_model_len)
+        else:
+            # We need to re-verify max_model_len to avoid lengths
+            # greater than position_embedding.
+            model_config = vllm_config.model_config
+            hf_text_config = model_config.hf_text_config
+
+            if isinstance(model_config.hf_overrides, dict):
+                # hf_overrides_kw
+                max_model_len = model_config.hf_overrides.get(
+                    "max_model_len", vllm_config.model_config.max_model_len)
+            else:
+                # hf_overrides_fn
+                # This might be overridden by sentence_bert_config.json.
+                max_model_len = vllm_config.model_config.max_model_len
+
+            # reset hf_text_config for recalculate_max_model_len.
+            if hasattr(hf_text_config, "max_model_len"):
+                delattr(hf_text_config, "max_model_len")
+            hf_text_config.max_position_embeddings = max_trained_positions
+            hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"]
+
+            # The priority of sentence_bert_config.json is higher
+            # than max_position_embeddings
+            encoder_config = deepcopy(model_config.encoder_config)
+            encoder_config.pop("max_seq_length", None)
+            model_config.encoder_config = encoder_config
+
+            vllm_config.recalculate_max_model_len(max_model_len)
         return config
 
 
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index acbc5d04d7e359f52e2eb8e37e0ef4d61ce217c2..2b457fd8a5b25eed5ee77cde02972812ebe5dab9 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Minimal implementation of BlipVisionModel intended to be only used 
 within a vision language model."""
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index db0dd2051d527ffdcb3c5033e07a223a0c544873..279541bed55a0051f2523c38504814f344947b15 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Literal, Optional, TypedDict, Union
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index 10424e218fbccfb0597cf4fc414688acd9bfe11a..6e4a399f3cc6e12c917fa8d3333feeb0b2cc53f8 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index a4528ca26d0101dfb23c5f9ab00df63bdfa3d99a..aea44261dd69f299e66df172dae49e5b2bdefd41 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 4e95afe1a14746a587b9eb9489d8a956cde59f7e..129f0942f14efb894b68d4180614af45b3d2a4c8 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from
 # https://github.com/THUDM/ChatGLM2-6B
 """Inference-only ChatGLM model compatible with THUDM weights."""
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index e8f3ae2156e02b4425d66052dfab006b288bf3bc..dcab0082287046a4870361f15ba8a6f9bdf7d3f6 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Minimal implementation of CLIPVisionModel intended to be only used
 within a vision language model."""
 from collections.abc import Iterable
@@ -106,7 +107,6 @@ class CLIPAttention(nn.Module):
                 f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
                 f" {self.num_heads}).")
         self.scale = self.head_dim**-0.5
-        self.dropout = config.attention_dropout
 
         self.qkv_proj = QKVParallelLinear(
             hidden_size=self.embed_dim,
@@ -129,10 +129,6 @@ class CLIPAttention(nn.Module):
         self.attn = MultiHeadAttention(self.num_heads_per_partition,
                                        self.head_dim, self.scale)
 
-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads,
-                           self.head_dim).transpose(1, 2).contiguous()
-
     def forward(
         self,
         hidden_states: torch.Tensor,
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 546b5f932877d446121a191d844abac7979f5143..ee67cc64050e79641be89f4c12b08f899c797b11 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/constant_size_cache.py b/vllm/model_executor/models/constant_size_cache.py
index f1cc7e0f9e293fb8d67b2087249a59d27c066415..f03c58a12932faba1207a76d7f76e8fdd95bf331 100644
--- a/vllm/model_executor/models/constant_size_cache.py
+++ b/vllm/model_executor/models/constant_size_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from typing import Any
 
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index f21887f71d8579ac3181fdc25b1daad175c7e460..7a4dd69443ad715abb25a7338a96c7c142142dee 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Optional, Union
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
index 88d1ca9f7b8334242bf8263daa6e14b543a82eb2..2f0202f1e038d22a01f56637e15f1a10fbe5aa7f 100644
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index 03ef7bed0edcfcd6ef88421c2689d89c6c5c3ca1..6e6e74b0d1d9b881ed8a4d0f7a7028771bdfdac0 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
 from typing import Optional
 
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index d7b5742e35a41ebf6d882136189d42b284ba06e4..464e00d983a6638d250c627fe47960a53543dd3a 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 5c8793f59ffbe00455661d8f2ba4573eb1c531a1..765718e5752033fcc1a5a03591db4044a8713e8b 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py
 """Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py
index fb1675d29915d539e4da91f4a4da6698ad4c24c4..2219321457b2ae544c50d31d4f6f50f062a3d7d4 100644
--- a/vllm/model_executor/models/eagle.py
+++ b/vllm/model_executor/models/eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Optional
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 838560692bcf5c89c79dda568c14311bac461851..aaf105ec2552a492a0bd5906e457f253fce3ebe4 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py
diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py
index 00dbbebb120e8d344263a6e34db9181530975256..d78ee100b26df8cecbbeed7acba1decd18289688 100644
--- a/vllm/model_executor/models/fairseq2_llama.py
+++ b/vllm/model_executor/models/fairseq2_llama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved.
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 376793594f8ba1b35f048d71b95e18076edf3f41..62a93dabd5d7f9a278e4d568cc7d12314668c886 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index 1c0e3911fccee8437d695d84545af6cf4b65e5d5..28f257eabed0152c8c53009a3835df7c0babfc3e 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only FalconH1 model."""
 from collections.abc import Iterable
 from typing import Optional
diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py
index f8acc56706d2b37b83a7af6d285f9a5816d4d141..47760aabb959193c582c66f7683023131830a484 100644
--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections import OrderedDict
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index fbad7f56d0ba7aedc19faf1a46078843abd83916..cb141dbc5aa376cdc45d0b86258117d0e7fbd18e 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 0f6d94e7518bba872c4894d79b10136a566b4507..99ed51f8e70af1ac23c428f84bcdc325e955c069 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2023 The vLLM team.
 # Copyright (c) Google Inc.
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index b46716213c626f0aa8dc187a73def7365d1f34ab..ce405041b3d4a5ed263d6506ccba9ba1e665cf7e 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index 3a88adcce0bdd4db806ee7cfc267ab41673e8e68..e19e0026b3f99ca1d538e7b3e69a62d04c02fa9a 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright 2025 The vLLM team.
 # Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 182cc86d3ca8f470ab1c76469c0848d9f11147a9..23e25170799ba6400154110154993ef26223d1da 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Any, Literal, Optional, TypedDict
diff --git a/vllm/model_executor/models/glm.py b/vllm/model_executor/models/glm.py
index 6269ebcee5c08b0e5a4244231143b4c90408e847..defa77b84e44153f330f367bf8d9e6b954ecfeee 100644
--- a/vllm/model_executor/models/glm.py
+++ b/vllm/model_executor/models/glm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only HF format GLM-4 model compatible with THUDM weights."""
 from vllm.config import VllmConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index f351ce5a06810a4f312cc1be0a04a2b3ea747ef7..5e2908a82c41858874a92053039546046dfab73f 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2025 The Zhipu AI team.
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 4e13716719ace3288b0ea3e9e2444337eb178f5b..034c7654f4d94e93a938a2d21b63a6edfcea00fb 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/THUDM/CogAgent
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index c2c310fca4d944ee518c20632181adbc448e008d..fd3decbaebec49ab24de923b8f55b469aee29379 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index c4ae4fc3c0062e7a372c832bc4f5550a7eb368fd..661a67bdc0db010a87f959ff3fdcd85ead8082b9 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 69fdd90cfbe8b17d9a254c4d08078bc6404f2cbd..bd162a5e57bc18ab84cde42cec832b5def14b465 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 401fa9f5cc8bc738b01887208965740e083bda0f..d418d8bb86cee1742a411670816ee25f085e43c0 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 3524d036db222a6c2b54549ca4d56d804dadff55..bd4d5d0b6b28acba451a4f790d707d7e5cc6e842 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index fd8fb48c50e3a40417d731d96b9cecb4d6c23079..831164ba88a4d86e0e0f23d5364e13c4ad97efa4 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index f342dfff824f074c112221d8b0847d94b2d6b187..5a70f3a616c6d0e1160c74326880e647f2d343d0 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index 443b102c99680be5cef7c45eb12822a737cacc40..f434b7a74e4861fe8aa67d075611608361db8bcd 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only GraniteMoeHybrid model."""
 # Added by the IBM Team, 2025
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py
index 817e6091d276a4a89a5dbf663bae9dd944600649..bb160dbce45b29f17918c276b520a6a3998985ec 100644
--- a/vllm/model_executor/models/granitemoeshared.py
+++ b/vllm/model_executor/models/granitemoeshared.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only GraniteMoeShared model.
 
 The architecture is the same as granitemoe but with the addition of shared
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py
index 6a444e8d1068c64095eb2d9ecf36b361a10c9c46..4273afbf46998f6ddb047cd5263d72d8112300f4 100644
--- a/vllm/model_executor/models/gritlm.py
+++ b/vllm/model_executor/models/gritlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from array import array
 from typing import Optional
diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py
index bc9e9a3c0206478f342b60174242ff38c1cb25b4..2d930527b2be0c9bb7a43e507f48cf49611ea515 100644
--- a/vllm/model_executor/models/grok1.py
+++ b/vllm/model_executor/models/grok1.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from
 # https://github.com/ROCm/vllm/blob/cea7419f151cc50293a05b7fac8547f8f887c9f6/vllm/model_executor/models/grok1.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index 904f5330c653ef9bada22ed70fa69a35700de473..8f7f359b755216c9987f1218d45662ec69a9d6d9 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py
 # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index b8bdc7aa32b25cd59f1dcfceaf095dd881665cc2..9e27200fb1c8982da8c1ab06e758684458a8a847 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py
 # Copyright 2024 The vLLM team.
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index fdb128ef5b541f176967b1f5ccb59826b7e304a8..de8596282ca9ce40da324b5ea02cccd74518d1d9 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
@@ -21,8 +22,8 @@ from typing import Literal, Optional, TypedDict, Union
 
 import torch
 from torch import nn
-from transformers import (BatchFeature, Idefics3Config, Idefics3ImageProcessor,
-                          Idefics3Processor)
+from transformers import (AddedToken, BatchFeature, Idefics3Config,
+                          Idefics3ImageProcessor, Idefics3Processor)
 
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.linear import ReplicatedLinear
@@ -198,13 +199,21 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
 
         return grid_w * grid_h + 1
 
+    # TODO: Remove after requiring transformers>=4.52
+    def _get_content(self, token: Union[AddedToken, str]) -> str:
+        if isinstance(token, str):
+            return token
+
+        return token.content
+
     def _get_image_token(
             self,
             processor: Optional[Idefics3Processor]) -> tuple[str, str, str]:
         if processor is None:
             processor = self.get_hf_processor()
-        image_token = processor.image_token.content
-        fake_image_token = processor.fake_image_token.content
+
+        image_token = self._get_content(processor.image_token)
+        fake_image_token = self._get_content(processor.fake_image_token)
         global_image_token = processor.global_image_tag
         return image_token, fake_image_token, global_image_token
 
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 8be8841c1f6c9e5b2063107e76ae16eaf3078127..cb2a4062b84cf5fa4d39b9ac7b345318fb8a5d96 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol,
                     Union, overload, runtime_checkable)
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index d325a6b6713284cb77e9ca6045219791a8574c4a..4a1ea74a218a47cc4daf2554181319d41878e8a9 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import (TYPE_CHECKING, Optional, Protocol, Union, overload,
                     runtime_checkable)
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index d9d9002bd5baad2dad0de187f2011732b0281b91..58e8163e0b26e409c877194ab4df3d21c2cce0ff 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
 # --------------------------------------------------------
@@ -415,6 +416,10 @@ class InternVisionEncoder(nn.Module):
 
 class InternVisionModel(nn.Module):
 
+    packed_modules_mapping = {
+        "qkv": ["qkv"],
+    }
+
     def __init__(
         self,
         config: PretrainedConfig,
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 3f3e3966e838a2b2018835a21b7603cdff9471c9..e8549b4e05384633f315ae16a6c2b7a8a06cced2 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from functools import partial
diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py
index 6893d0239121d5c514be32f6fc73fbbc29cb2a96..4bbb49da0e96f2b15a519ff5fa133ea86d5e2cdd 100644
--- a/vllm/model_executor/models/internlm2_ve.py
+++ b/vllm/model_executor/models/internlm2_ve.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional, Union
 
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 4612fc4387412b197a002f0a342bfad9435a1d2c..0c61369c5f51828ba37d1b3dc3f89fff5c9272c5 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
 # --------------------------------------------------------
@@ -22,6 +23,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.models.intern_vit import (InternVisionModel,
                                                    InternVisionPatchModel)
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import convert_image_mode
@@ -36,7 +38,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
+                         SupportsMultiModal, SupportsPP)
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
 
@@ -1014,7 +1017,8 @@ class InternVLMultiModalProcessor(
     InternVLMultiModalProcessor,
     info=InternVLProcessingInfo,
     dummy_inputs=InternVLDummyInputsBuilder)
-class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
+class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
+                        SupportsLoRA):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
@@ -1403,3 +1407,12 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
         ]
         loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
         return loader.load_weights(weights)
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="mlp1",
+            tower_model="vision_model")
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index d6a1e0bb4845445165ae0fae8864e8e8f8ee5189..bed4a5dff2efae1a10d25b10c709e2150fa8eccc 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 6f9fa60c9b05e6081825168139bfdb8533f12f3b..8294f846bbd1038f1314b9c1ee48acda20f93a9d 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only Jamba model."""
 from collections.abc import Iterable
 from typing import Optional
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index b575f44765a890c382b74771b32f5e97cfcd1bc5..351d1fbdc744497a7e51c5c413f76da68048ce74 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: E501
 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/modeling_kimi_vl.py
 # Copyright 2025 The Moonshot AI Team, DeepSeek-AI, and HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index d36b6466c0bb97bc5e25b31d8ecde04c50bc6c35..5d5080479e51016727d8695d0368e7146ba663e8 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index 40fdd84d8fb08fee24c467b0938fc2231a6674f7..a852be66bde827d220c2304a98e69f4736d24efe 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 #
 # Copyright 2025 the LLAMA4, Meta Inc., vLLM, and HuggingFace Inc. team.
 # All rights reserved.
diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py
index 172dc8b5ec06a3d446b3e8656ad6a6bacc7f0705..c7690604c1d09bfc2377329bed7ba02a7fdc09ec 100644
--- a/vllm/model_executor/models/llama_eagle.py
+++ b/vllm/model_executor/models/llama_eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 
@@ -54,13 +55,11 @@ class LlamaModel(nn.Module):
             speculative_config.draft_model_config.hf_config
         self.vocab_size = self.config.vocab_size
 
-        # if PP disabled then draft will share embed with target
-        if get_pp_group().world_size > 1:
-            self.embed_tokens = VocabParallelEmbedding(
-                self.config.vocab_size,
-                self.config.hidden_size,
-                prefix=maybe_prefix(prefix, "embed_tokens"),
-            )
+        self.embed_tokens = VocabParallelEmbedding(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "embed_tokens"),
+        )
 
         self.layers = nn.ModuleList([
             LlamaDecoderLayer(
@@ -163,4 +162,4 @@ class EagleLlamaForCausalLM(LlamaForCausalLM):
             if "lm_head" not in name:
                 name = "model." + name
             model_weights[name] = loaded_weight
-        return loader.load_weights(model_weights.items())
+        loader.load_weights(model_weights.items())
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index f211bfe54a7d771f6b18def0463b71593455ff69..7fc9fe2ebb6f6897de421b454413497ba04396e2 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Optional
@@ -9,7 +10,6 @@ from transformers import LlamaConfig
 
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
-from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import QKVParallelLinear
@@ -94,13 +94,11 @@ class LlamaModel(nn.Module):
             speculative_config.draft_model_config.hf_config
         self.vocab_size = self.config.vocab_size
 
-        # if PP disabled then draft will share embed with target
-        if get_pp_group().world_size > 1:
-            self.embed_tokens = VocabParallelEmbedding(
-                self.config.vocab_size,
-                self.config.hidden_size,
-                prefix=maybe_prefix(prefix, "embed_tokens"),
-            )
+        self.embed_tokens = VocabParallelEmbedding(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "embed_tokens"),
+        )
 
         self.layers = nn.ModuleList([
             LlamaDecoderLayer(
@@ -215,6 +213,9 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
         logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         if self.draft_id_to_target_id is None:
+            assert logits.shape[1] == self.config.vocab_size, \
+                "Expected logits to have shape " \
+                f"(*, {self.config.vocab_size}), but got {logits.shape}"
             return logits
 
         base = torch.arange(self.config.draft_vocab_size, device=logits.device)
@@ -234,24 +235,29 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
         return self.model.fc(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-        loader = AutoWeightsLoader(
-            self,
-            skip_prefixes=None,
-        )
-
         model_weights = {}
+        includes_draft_id_mapping = False
+        includes_embed_tokens = False
         for name, loaded_weight in weights:
             if "t2d" in name:
                 continue
             if "d2t" in name:
                 name = name.replace("d2t", "draft_id_to_target_id")
+                includes_draft_id_mapping = True
             elif "lm_head" not in name:
                 name = "model." + name
+            if "embed_tokens" in name:
+                includes_embed_tokens = True
             model_weights[name] = loaded_weight
 
-        loaded_weights = loader.load_weights(model_weights.items())
-
-        if 'd2t' not in loaded_weights:
-            self.draft_id_to_target_id = None
-
-        return loaded_weights
+        skip_substrs = []
+        if not includes_draft_id_mapping:
+            skip_substrs.append("draft_id_to_target_id")
+        if not includes_embed_tokens:
+            skip_substrs.append("embed_tokens")
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=None,
+            skip_substrs=skip_substrs,
+        )
+        loader.load_weights(model_weights.items())
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index ced71b6dcdebe361e7b532ef39cbe69167c93daa..725e1b2c1948120ebd344746d5091c98f65894bb 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 2fb79f57a67f12065cb407fab1fbbbedcfb56518..6f5f231875de5f35405aed13f4af846881510652 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 9303ea12172733e0a4a325030063b8b3d33237af..a3406d090db85e08d3475c2668e018c605647004 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 7ea759fd59b828a4ee4901ff024245f913f574e8..d90d3d4a0960d1da4e2a73b31b7dd62df46461f6 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index ce76a76b6574351ffa78333da26bd3bede548b4a..8162ac3f7597d3917ee9b5af2a6133d71873b70a 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """PyTorch MAMBA model."""
 from collections.abc import Iterable
 from typing import Optional
diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py
index 858a1633befa0f18c1b0162da2ec03ab6f2a7aee..cf9e1bd03e9860a9a8d603dab6be538028377802 100644
--- a/vllm/model_executor/models/mamba2.py
+++ b/vllm/model_executor/models/mamba2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """PyTorch MAMBA2 model."""
 from collections.abc import Iterable
 from typing import Optional
@@ -32,7 +33,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from vllm.utils import LayerBlockType
 
-from .utils import (is_pp_missing_parameter,
+from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
@@ -167,6 +168,27 @@ class Mamba2Model(nn.Module):
 
         return hidden_states
 
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "A_log" in name:
+                name = name.replace("A_log", "A")
+
+            # Skip loading extra bias for GPTQ models.
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+            if is_pp_missing_parameter(name, self):
+                continue
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
 
 class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree,
                         SupportsV0Only):
@@ -282,21 +304,5 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree,
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if "A_log" in name:
-                name = name.replace("A_log", "A")
-
-            # Skip loading extra bias for GPTQ models.
-            if name.endswith(".bias") and name not in params_dict:
-                continue
-            if is_pp_missing_parameter(name, self):
-                continue
-
-            param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py
index 47d0ef9cc6bb1c87c0a9defa0a19079b28947cef..49ba974c69a5ef8e4654da86943be37ea6c7117f 100644
--- a/vllm/model_executor/models/mamba_cache.py
+++ b/vllm/model_executor/models/mamba_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 
diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py
index 95ef1134b1bf9260768b9ec83438e575fff7c585..709a5a993c6f7509b03f4f7c995b570eacacdaf1 100644
--- a/vllm/model_executor/models/medusa.py
+++ b/vllm/model_executor/models/medusa.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Optional
diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py
index 49ea64e029d63a035f7c9d9956c5b02d6e7bfeb5..9b83f848ef428909f7c95b40f076607810b6f4c7 100644
--- a/vllm/model_executor/models/mimo.py
+++ b/vllm/model_executor/models/mimo.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py
index cbca6a4c8f9d2e1ea959f4bfd96e1ba6b2828e11..6066ec76c5fc084429b6c5b85b3b58f7d83f9cb4 100644
--- a/vllm/model_executor/models/mimo_mtp.py
+++ b/vllm/model_executor/models/mimo_mtp.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/model_executor/models/deepseek_mtp.py
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 0397b552ce9f97219689784041f50f8ce2bee913..d398a5d12bbcdbbfa4610ce8e9799d52bc64b370 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
@@ -242,6 +243,7 @@ class MiniCPMAttention(nn.Module):
             base=rope_theta,
             rope_scaling=rope_scaling,
         )
+
         self.attn = Attention(self.num_heads,
                               self.head_dim,
                               self.scaling,
@@ -444,6 +446,7 @@ class MiniCPMModel(nn.Module):
             for weight_name in ["w1", "w2", "w3"]
         ]
         params_dict = dict(self.named_parameters())
+
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
@@ -567,7 +570,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
         hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
+                                   inputs_embeds) / self.scale_width
         return hidden_states
 
     def compute_logits(
@@ -575,7 +578,6 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         hidden_states: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        hidden_states = hidden_states / self.scale_width
         logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index 2a6867d12d9933694374ca3cb7328562f8ce0a03..92c13e81bf3e4a437af6c402c9da5c955b2f1919 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py
new file mode 100644
index 0000000000000000000000000000000000000000..06c2eb4e80afb2922b3f7ac5555f74fac234f177
--- /dev/null
+++ b/vllm/model_executor/models/minicpm_eagle.py
@@ -0,0 +1,391 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only EagleMiniCPM model compatible with HuggingFace weights."""
+import math
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, VllmConfig
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import SupportsLoRA, SupportsPP
+from .minicpm import MiniCPMAttention as EagleMiniCPMAttention
+from .minicpm import MiniCPMMLP as EagleMiniCPMMLP
+from .minicpm import MiniCPMMoE as EagleMiniCPMMoE
+from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, maybe_prefix)
+
+
+class EagleMiniCPMDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.cache_config = cache_config
+        self.quant_config = quant_config
+        self.hidden_size = config.hidden_size
+        self.rope_theta = getattr(config, "rope_theta", 10000)
+        self.rope_scaling = getattr(config, "rope_scaling", None)
+        self.max_position_embeddings = getattr(config,
+                                               "max_position_embeddings", 8192)
+        self.prefix = prefix
+        self._init_attn_block()
+        self._init_ffn_block()
+
+    def _init_attn_block(self):
+        self.input_layernorm = RMSNorm(self.config.hidden_size,
+                                       eps=self.config.rms_norm_eps)
+        self.self_attn = EagleMiniCPMAttention(
+            hidden_size=self.hidden_size,
+            num_heads=self.config.num_attention_heads,
+            num_kv_heads=self.config.num_key_value_heads,
+            rope_theta=self.rope_theta,
+            rope_scaling=self.rope_scaling,
+            max_position_embeddings=self.max_position_embeddings,
+            cache_config=self.cache_config,
+            quant_config=self.quant_config,
+            prefix=f"{self.prefix}.self_attn",
+        )
+
+    def _init_ffn_block(self):
+        self.post_attention_layernorm = RMSNorm(self.config.hidden_size,
+                                                eps=self.config.rms_norm_eps)
+        self.num_experts = getattr(self.config, "num_experts", 0)
+        if self.num_experts == 0:
+            self.mlp = EagleMiniCPMMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=self.config.intermediate_size,
+                hidden_act=self.config.hidden_act,
+                hidden_act_param=getattr(self.config, "hidden_act_param", 0.),
+                quant_config=self.quant_config,
+            )
+        else:
+            self.mlp = EagleMiniCPMMoE(
+                num_experts=self.config.num_experts,
+                top_k=self.config.num_experts_per_tok,
+                hidden_size=self.config.hidden_size,
+                intermediate_size=self.config.intermediate_size)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+        hidden_states = residual + hidden_states * \
+            (self.config.scale_depth / math.sqrt(self.config.mup_denominator))
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states * \
+            (self.config.scale_depth / math.sqrt(self.config.mup_denominator))
+
+        return hidden_states, None
+
+
+@support_torch_compile
+class EagleMiniCPMModel(nn.Module):
+
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 prefix: str = "",
+                 start_layer: int = 0):
+        super().__init__()
+
+        config = vllm_config.speculative_config.draft_model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
+        self.config = config
+        self.cache_config = cache_config
+        self.quant_config = quant_config
+        lora_vocab = (lora_config.lora_extra_vocab_size *
+                      (lora_config.max_loras or 1)) if lora_config else 0
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
+        self.fc = torch.nn.Linear(self.config.hidden_size * 2,
+                                  self.config.hidden_size,
+                                  bias=False)
+        self.input_norm1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_norm2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+        )
+        self.num_experts = getattr(self.config, "num_experts", 0)
+        self._init_layers(prefix, config, cache_config, quant_config,
+                          start_layer)
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], self.config.hidden_size))
+
+    def _init_layers(
+        self,
+        prefix: str,
+        config: PretrainedConfig,
+        cache_config: Optional[CacheConfig],
+        quant_config: Optional[QuantizationConfig],
+        start_layer: int,
+    ):
+        self.eagle_layers = nn.ModuleList([
+            EagleMiniCPMDecoderLayer(
+                config,
+                cache_config,
+                quant_config,
+                f"{prefix}.eagle_layers.{i + start_layer}",
+            ) for i in range(self.config.num_hidden_layers)
+        ])
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        embedding = self.embed_tokens(input_ids)
+        return embedding * self.config.scale_emb
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        input_embeds = self.get_input_embeddings(input_ids)
+        input_embeds = self.input_norm1(input_embeds)
+        hidden_states = self.input_norm2(hidden_states)
+
+        hidden_states = self.fc(
+            torch.cat((input_embeds, hidden_states), dim=-1))
+        residual = None
+        for layer in self.eagle_layers:
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                residual,
+            )
+
+        return hidden_states, hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        expert_params_mapping = [
+            # (param_name, weight_name, expert_id)
+            ("ws" if weight_name in ["w1", "w3"] else "w2s",
+             f"experts.{expert_id}.{weight_name}.weight", expert_id)
+            for expert_id in range(self.num_experts)
+            for weight_name in ["w1", "w2", "w3"]
+        ]
+        params_dict = dict(self.named_parameters())
+
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if ("rotary_emb.cos_cached" in name
+                    or "rotary_emb.sin_cached" in name):
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for param_name, weight_name, expert_id in expert_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param,
+                                  loaded_weight,
+                                  weight_name,
+                                  expert_id=expert_id)
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.speculative_config.draft_model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
+        self.prefix = prefix
+        self.vllm_config = vllm_config
+        self.config = config
+        self.lora_config = lora_config
+        self.cache_config = cache_config
+        self.quant_config = quant_config
+
+        target_layer_num = vllm_config.model_config.get_num_layers(
+            vllm_config.parallel_config)
+
+        self.model = self._init_model(vllm_config=vllm_config,
+                                      prefix=maybe_prefix(prefix, "model"),
+                                      start_layer=target_layer_num)
+
+        unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.lm_head = ParallelLMHead(
+            unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE
+            # We need bigger padding if using lora for kernel
+            # compatibility
+            if not lora_config else lora_config.lora_vocab_padding_size,
+            quant_config=quant_config,
+        )
+        if config.tie_word_embeddings:
+            self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
+        self.scale_width = self.config.hidden_size / self.config.dim_model_base
+
+        self.logits_processor = LogitsProcessor(unpadded_vocab_size,
+                                                config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def _init_model(self,
+                    *,
+                    vllm_config: VllmConfig,
+                    prefix: str = "",
+                    start_layer: int = 0):
+        return EagleMiniCPMModel(vllm_config=vllm_config,
+                                 prefix=prefix,
+                                 start_layer=start_layer)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        hidden_states, hidden_states2 = self.model(input_ids, positions,
+                                                   hidden_states)
+        hidden_states = hidden_states / self.scale_width
+        hidden_states2 = hidden_states2 / self.scale_width
+        return hidden_states, hidden_states2
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."]
+                           if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py
index ae5df0f9273f6e390281ef3200c3c37881884319..ff5959ed196eacd58f0cd0ed433b845deac3d858 100644
--- a/vllm/model_executor/models/minicpmo.py
+++ b/vllm/model_executor/models/minicpmo.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 04cc7e35e3450b6d8b17ab0cd59c14cc60ef3e95..4100fee0ec841a75727dfb8b94a2f1cabd4328d2 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/minimax_cache.py b/vllm/model_executor/models/minimax_cache.py
index c95cbb419eb951fb925c54324ca12a44d72b1343..9164ac06a3b0a6d0cb84706a55b7b2716a253d2a 100644
--- a/vllm/model_executor/models/minimax_cache.py
+++ b/vllm/model_executor/models/minimax_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
 import torch
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index 36bab9ee13b17734a7583214286e55fa9e168d55..02800449bda3c59e17664ee8a5e923640a4e4f7a 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only MiniMaxText01 model."""
 import copy
 import math
@@ -141,7 +142,7 @@ class MiniMaxText01RotaryEmbedding(CustomOp):
         head_size: int,
         rotary_dim: int,
         max_position: int,
-        base: int,
+        base: float,
         is_neox_style: bool,
         cache_dtype: torch.dtype,
     ) -> None:
@@ -155,10 +156,7 @@ class MiniMaxText01RotaryEmbedding(CustomOp):
         cache = self._compute_cos_sin_cache().to(cache_dtype)
         self.register_buffer("cos_sin_cache", cache, persistent=False)
 
-    def _compute_inv_freq(
-        self,
-        base: Union[int, float],
-    ) -> torch.Tensor:
+    def _compute_inv_freq(self, base: float) -> torch.Tensor:
         """Compute the inverse frequency."""
         inv_freq = 1.0 / (base**(torch.arange(
             0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim))
diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py
index 14c1250ca3b4297520b57f571e0f74828c828972..b2ededcaf67ce92c3c05d4e754ba19fe572f71f5 100644
--- a/vllm/model_executor/models/minimax_vl_01.py
+++ b/vllm/model_executor/models/minimax_vl_01.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping
 from typing import Literal, Optional, TypedDict, Union, cast
 
diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index 051a73120838eb7f7a1694fcaa379241af214b0f..9147240b2b2a9b98895f673203e0c37eb1bc9047 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 9bc7a16153e1fc9750838dc782a1ef039b29ca85..dec365119c725af33a0fc613a38a692ede5e3b15 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
index 8220200d270c2d94f9295d09dbbc0f365e8bc212..3183c762d2b1416fd74b203a97203fb21ec66747 100644
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 713c9e8d203fabe3bc8bdf66d771f6719390e42d..e9f91feb3359d23df59c08be4d91939caeb34b41 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index 7f5c9da2b9305ba2752683f7241f88d4ae482a5f..ccfafba52a8ac6ea011a83bf69a5d85d8deecd13 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 #
 # Copyright 2025 the LLAMA4, Meta Inc., vLLM, and HuggingFace Inc. team.
 # All rights reserved.
@@ -34,6 +35,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.inputs import InputProcessingContext
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
+                                               ReplicatedLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
@@ -49,6 +51,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
                                         PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.multimodal.utils import run_dp_sharded_vision_model
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -84,23 +87,29 @@ class Llama4ImagePatchInputs(TypedDict):
 
 class Llama4VisionMLP(nn.Module):
 
-    def __init__(self,
-                 input_size: int,
-                 intermediate_size: int,
-                 output_size: int,
-                 bias: bool,
-                 output_activation: bool,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        input_size: int,
+        intermediate_size: int,
+        output_size: int,
+        bias: bool,
+        output_activation: bool,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
         super().__init__()
-        self.fc1 = ColumnParallelLinear(
+        cls_fc1 = (ReplicatedLinear
+                   if use_data_parallel else ColumnParallelLinear)
+        self.fc1 = cls_fc1(
             input_size=input_size,
             output_size=intermediate_size,
             bias=bias,
             quant_config=quant_config,
             prefix=f"{prefix}.fc1",
         )
-        self.fc2 = RowParallelLinear(
+        cls_fc2 = ReplicatedLinear if use_data_parallel else RowParallelLinear
+        self.fc2 = cls_fc2(
             input_size=intermediate_size,
             output_size=output_size,
             bias=bias,
@@ -155,10 +164,12 @@ def pixel_shuffle(input_tensor, shuffle_ratio):
                                         int(channels / shuffle_ratio))
     reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()
 
-    reshaped_tensor = reshaped_tensor.view(batch_size,
-                                           int(height * shuffle_ratio),
-                                           int(width * shuffle_ratio),
-                                           int(channels / (shuffle_ratio**2)))
+    reshaped_tensor = reshaped_tensor.view(
+        batch_size,
+        int(height * shuffle_ratio),
+        int(width * shuffle_ratio),
+        int(channels / (shuffle_ratio**2)),
+    )
     reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()
 
     output_tensor = reshaped_tensor.view(batch_size, -1,
@@ -173,6 +184,7 @@ class Llama4VisionPixelShuffleMLP(nn.Module):
         config,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         super().__init__()
         self.pixel_shuffle_ratio = config.pixel_shuffle_ratio
@@ -186,7 +198,9 @@ class Llama4VisionPixelShuffleMLP(nn.Module):
             bias=config.multi_modal_projector_bias,
             output_activation=True,
             quant_config=quant_config,
-            prefix=f"{prefix}.mlp")
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
 
     def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
         encoded_patches = pixel_shuffle(encoded_patches,
@@ -201,10 +215,12 @@ class Llama4VisionAttention(nn.Module):
         config: Llama4VisionConfig,
         quant_config: Optional[QuantizationConfig],
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         super().__init__()
         self.config = config
-        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_size = (1 if use_data_parallel else
+                        get_tensor_model_parallel_world_size())
         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = config.hidden_size // self.num_heads
@@ -217,22 +233,39 @@ class Llama4VisionAttention(nn.Module):
 
         self.attn = MultiHeadAttention(self.num_local_heads, self.head_dim,
                                        self.scaling)
-        self.qkv_proj = QKVParallelLinear(
-            self.embed_dim,
-            self.head_dim,
-            self.num_heads,
-            bias=True,
-            quant_config=quant_config,
-            prefix=f"{prefix}.qkv_proj",
-        )
-        self.o_proj = RowParallelLinear(
-            self.num_heads * self.head_dim,
-            self.embed_dim,
-            bias=True,
-            input_is_parallel=True,
-            quant_config=quant_config,
-            prefix=f"{prefix}.o_proj",
-        )
+
+        if use_data_parallel:
+            self.qkv_proj = ReplicatedLinear(
+                self.embed_dim,
+                self.q_size + 2 * self.kv_size,
+                bias=True,
+                quant_config=quant_config,
+                prefix=f"{prefix}.qkv_proj",
+            )
+            self.o_proj = ReplicatedLinear(
+                self.num_heads * self.head_dim,
+                self.embed_dim,
+                bias=True,
+                quant_config=quant_config,
+                prefix=f"{prefix}.o_proj",
+            )
+        else:
+            self.qkv_proj = QKVParallelLinear(
+                self.embed_dim,
+                self.head_dim,
+                self.num_heads,
+                bias=True,
+                quant_config=quant_config,
+                prefix=f"{prefix}.qkv_proj",
+            )
+            self.o_proj = RowParallelLinear(
+                self.num_heads * self.head_dim,
+                self.embed_dim,
+                bias=True,
+                input_is_parallel=True,
+                quant_config=quant_config,
+                prefix=f"{prefix}.o_proj",
+            )
 
         self.rotary_emb = get_rope(
             head_size=self.head_dim,
@@ -275,22 +308,29 @@ class Llama4VisionEncoderLayer(nn.Module):
         config: Llama4VisionConfig,
         quant_config: Optional[QuantizationConfig],
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_attention_heads = config.num_attention_heads
         self.intermediate_size = config.intermediate_size
 
-        self.self_attn = Llama4VisionAttention(config,
-                                               quant_config=quant_config,
-                                               prefix=f"{prefix}.self_attn")
-        self.mlp = Llama4VisionMLP(input_size=config.hidden_size,
-                                   intermediate_size=config.intermediate_size,
-                                   output_size=config.hidden_size,
-                                   bias=True,
-                                   output_activation=False,
-                                   quant_config=quant_config,
-                                   prefix=f"{prefix}.mlp")
+        self.self_attn = Llama4VisionAttention(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+            use_data_parallel=use_data_parallel,
+        )
+        self.mlp = Llama4VisionMLP(
+            input_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            output_size=config.hidden_size,
+            bias=True,
+            output_activation=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
 
         self.input_layernorm = nn.LayerNorm(config.hidden_size)
         self.post_attention_layernorm = nn.LayerNorm(config.hidden_size)
@@ -322,6 +362,7 @@ class Llama4VisionEncoder(nn.Module):
         config: Llama4VisionConfig,
         quant_config: Optional[QuantizationConfig],
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         super().__init__()
         self.config = config
@@ -330,6 +371,7 @@ class Llama4VisionEncoder(nn.Module):
                 config,
                 quant_config=quant_config,
                 prefix=f"{prefix}.layers.{layer_idx}",
+                use_data_parallel=use_data_parallel,
             ) for layer_idx in range(config.num_hidden_layers)
         ])
 
@@ -357,23 +399,33 @@ class Llama4VisionEncoder(nn.Module):
 
 class Llama4UnfoldConvolution(nn.Module):
 
-    def __init__(self,
-                 config: Llama4VisionConfig,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(
+        self,
+        config: Llama4VisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
         super().__init__()
         kernel_size = config.patch_size
         if isinstance(kernel_size, int):
             kernel_size = (kernel_size, kernel_size)
         self.unfold = torch.nn.Unfold(kernel_size=kernel_size,
                                       stride=config.patch_size)
-        self.linear = ColumnParallelLinear(config.num_channels *
-                                           kernel_size[0] * kernel_size[1],
-                                           config.hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           gather_output=True,
-                                           prefix=f"{prefix}.linear")
+        params = {
+            "input_size":
+            config.num_channels * kernel_size[0] * kernel_size[1],
+            "output_size": config.hidden_size,
+            "bias": False,
+            "quant_config": quant_config,
+            "prefix": f"{prefix}.linear",
+        }
+        if use_data_parallel:
+            cls = ReplicatedLinear
+        else:
+            cls = ColumnParallelLinear
+            params["gather_output"] = True
+        self.linear = cls(**params)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = self.unfold(hidden_states)
@@ -389,6 +441,7 @@ class Llama4VisionModel(nn.Module):
         config: Llama4VisionConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         super().__init__()
         self.config = config
@@ -403,7 +456,9 @@ class Llama4VisionModel(nn.Module):
         self.patch_embedding = Llama4UnfoldConvolution(
             config,
             quant_config=quant_config,
-            prefix=f"{prefix}.patch_embedding")
+            prefix=f"{prefix}.patch_embedding",
+            use_data_parallel=use_data_parallel,
+        )
 
         self.class_embedding = nn.Parameter(self.scale *
                                             torch.randn(self.hidden_size))
@@ -415,11 +470,18 @@ class Llama4VisionModel(nn.Module):
         self.layernorm_post = nn.LayerNorm(self.hidden_size, eps=1e-5)
 
         # encoders
-        self.model = Llama4VisionEncoder(config,
-                                         quant_config=quant_config,
-                                         prefix=f"{prefix}.model")
+        self.model = Llama4VisionEncoder(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.model",
+            use_data_parallel=use_data_parallel,
+        )
         self.vision_adapter = Llama4VisionPixelShuffleMLP(
-            config, quant_config, prefix=f"{prefix}.vision_adapter")
+            config,
+            quant_config,
+            prefix=f"{prefix}.vision_adapter",
+            use_data_parallel=use_data_parallel,
+        )
 
     def forward(
         self,
@@ -529,8 +591,9 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
         vision_config = self.info.get_hf_config().vision_config
 
         if processed_outputs.get("pixel_values") is not None:
-            assert "images" in mm_data, \
-                "images expected to be in mm_data when pixel_values is present"
+            assert (
+                "images" in mm_data
+            ), "images expected to be in mm_data when pixel_values is present"
 
             images = mm_data["images"]
             parsed_images = (self._get_data_parser().parse_mm_data({
@@ -547,8 +610,8 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
                 get_best_fit(
                     (image.size[1], image.size[0]),
                     torch.tensor(possible_resolutions),
-                    resize_to_max_canvas=image_processor.resize_to_max_canvas)
-                for image in parsed_images
+                    resize_to_max_canvas=image_processor.resize_to_max_canvas,
+                ) for image in parsed_images
             ]
             # TODO tile height/width do not necessarily need to match
             aspect_ratios = [(image_size[0] // tile_size,
@@ -660,13 +723,17 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
+        self.use_data_parallel = (vllm_config.parallel_config.
+                                  enable_multimodal_encoder_data_parallel)
         self.config = config
         self.quant_config = quant_config
         self.multimodal_config = multimodal_config
-        self.vision_model = Llama4VisionModel(config.vision_config,
-                                              None,
-                                              prefix=maybe_prefix(
-                                                  prefix, "vision_model"))
+        self.vision_model = Llama4VisionModel(
+            config.vision_config,
+            None,
+            prefix=maybe_prefix(prefix, "vision_model"),
+            use_data_parallel=self.use_data_parallel,
+        )
         self.multi_modal_projector = Llama4MultiModalProjector(
             self.config,
             None,
@@ -710,7 +777,13 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
         flat_data = image_input["flat_data"]
         patches_per_image = image_input["patches_per_image"].tolist()
 
-        vision_embeddings_flat = self.vision_model(flat_data)
+        # shard image input
+        if self.use_data_parallel:
+            vision_embeddings_flat = run_dp_sharded_vision_model(
+                flat_data, self.vision_model)
+        else:
+            vision_embeddings_flat = self.vision_model(flat_data)
+
         vision_embeddings_flat = self.multi_modal_projector(
             vision_embeddings_flat)
 
@@ -797,6 +870,30 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         return get_prefix_weights(), get_other_weights()
 
+    def _consolidate_qkv_weights(
+        self, weights: Iterable[tuple[str, torch.Tensor]]
+    ) -> Iterable[tuple[str, torch.Tensor]]:
+        qkv_idx_mappings = {
+            ".self_attn.q_proj": 0,
+            ".self_attn.k_proj": 1,
+            ".self_attn.v_proj": 2,
+        }
+        qkv_weights = {}
+        for name, loaded_weight in weights:
+            for weight_name, idx in qkv_idx_mappings.items():
+                if weight_name not in name:
+                    continue
+                new_name = name.replace(weight_name, ".self_attn.qkv_proj")
+                if new_name not in qkv_weights:
+                    qkv_weights[new_name] = [None] * 3
+                qkv_weights[new_name][idx] = loaded_weight
+                break
+            else:
+                yield name, loaded_weight
+        for key, weight in qkv_weights.items():
+            qkv_weight = torch.cat(weight, dim=0)
+            yield key, qkv_weight
+
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
 
@@ -819,9 +916,12 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
         assert loaded_language_model_params is not None
         updated_params.update(loaded_language_model_params)
 
+        if self.use_data_parallel:
+            other_weights = self._consolidate_qkv_weights(other_weights)
+
         for name, loaded_weight in other_weights:
             for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
+                if weight_name not in name or self.use_data_parallel:
                     continue
                 name = name.replace(weight_name, param_name)
                 param = params_dict[name]
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index a7d7aa7d44ef28d3b47b89b73dde3a591c9877a5..c6a97388dc1880cd89e9dc6e24291aad990981b9 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index 86552aa05bf951c20a49f5bf789b1774d0340a5f..35f416a6e21e8734176aeb24c54fb6b284075751 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
 from typing import Optional
 
@@ -12,7 +13,7 @@ from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.pooler import CrossEncodingPooler
+from vllm.model_executor.layers.pooler import ClassifierPooler
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
@@ -278,8 +279,9 @@ class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding):
         self.model = ModernBertModel(vllm_config=vllm_config,
                                      prefix=maybe_prefix(prefix, "modernbert"))
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-        self._pooler = CrossEncodingPooler(config, self.classifier,
-                                           ModernBertPooler(config))
+        self._pooler = ClassifierPooler(vllm_config.model_config,
+                                        self.classifier,
+                                        ModernBertPooler(config))
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
 
diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py
index 25e6f594069ef3abded48e8901efa108d7c3354f..11a2a384c165e574bfd60fe6d534062512e280bd 100644
--- a/vllm/model_executor/models/module_mapping.py
+++ b/vllm/model_executor/models/module_mapping.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 #  https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 640a2049a629358a9dd01e96df1ef6f9a8dba84f..1fa76b9ac7afadf12e348527cba254fd0a45656e 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py
index 9f11d4a42273370b9b51e3a57ea0f00857de3ba3..d0fdab13ef0c95921bb52c06d3c3eb569795477e 100644
--- a/vllm/model_executor/models/moonvit.py
+++ b/vllm/model_executor/models/moonvit.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: E501
 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/modeling_kimi_vl.py
 # This file is meant to be used in kimi_vl.py only
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 6c396d778ae71b39b7692ed1ca48d283cab1b517..0878ada34d1d8a472e52934378d257693cc59549 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
 import math
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index d0999e30e1ba47c6049e2d0857714d43ade07f2f..eabf47b1aede4c5609978c2b1dd9cdd15ad27e27 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
new file mode 100644
index 0000000000000000000000000000000000000000..3424efa80d48f421983a5fdbb76781352f52751f
--- /dev/null
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -0,0 +1,573 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from https://github.com/vllm-project/vllm/blob/94d8ec8d2bcb4ec55e33022b313c7e978edf05e1/vllm/model_executor/models/bamba.py
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only NemotronH model."""
+from collections.abc import Iterable
+from typing import Optional
+
+import torch
+from torch import nn
+
+from vllm.attention.layer import Attention
+from vllm.config import CacheConfig, VllmConfig
+from vllm.distributed import divide, get_tensor_model_parallel_world_size
+from vllm.distributed.parallel_state import get_pp_group
+from vllm.forward_context import get_forward_context
+from vllm.model_executor.layers.activation import ReLUSquaredActivation
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mamba.mamba2_metadata import (
+    Mamba2Metadata, prepare_mamba2_metadata)
+from vllm.model_executor.layers.mamba.mamba_mixer2 import (
+    MambaMixer2, extra_groups_for_head_shards)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
+                                                   SupportsLoRA, SupportsPP,
+                                                   SupportsQuant,
+                                                   SupportsV0Only)
+from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
+                                                    MambaCacheParams)
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader, make_empty_intermediate_tensors_factory, make_layers,
+    maybe_prefix)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs import NemotronHConfig
+from vllm.utils import LayerBlockType
+
+
+class NemotronHMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.up_proj = ColumnParallelLinear(
+            input_size=config.hidden_size,
+            output_size=config.intermediate_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=config.intermediate_size,
+            output_size=config.hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
+        self.act_fn = ReLUSquaredActivation()
+
+    def forward(self, x: torch.Tensor):
+        x, _ = self.up_proj(x)
+        x = self.act_fn(x)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class NemotronHMLPDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.mixer = NemotronHMLP(
+            config,
+            quant_config=quant_config,
+            bias=config.mlp_bias,
+            prefix=f"{prefix}.mixer",
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        **kwargs,
+    ):
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+        else:
+            hidden_states, residual = self.norm(hidden_states, residual)
+
+        hidden_states = self.mixer(hidden_states)
+        return hidden_states, residual
+
+
+class NemotronHMambaDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.mixer = MambaMixer2(
+            hidden_size=config.hidden_size,
+            ssm_state_size=config.ssm_state_size,
+            conv_kernel_size=config.conv_kernel,
+            intermediate_size=config.expand * config.hidden_size,
+            use_conv_bias=config.use_conv_bias,
+            use_bias=config.use_bias,
+            n_groups=config.n_groups,
+            num_heads=config.mamba_num_heads,
+            head_dim=config.mamba_head_dim,
+            rms_norm_eps=config.rms_norm_eps,
+            activation=config.mamba_hidden_act,
+            quant_config=quant_config,
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        mamba_cache_params: MambaCacheParams,
+        mamba2_metadata: Mamba2Metadata,
+        **kwargs,
+    ):
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+        else:
+            hidden_states, residual = self.norm(hidden_states, residual)
+
+        hidden_states = self.mixer(hidden_states, mamba_cache_params,
+                                   mamba2_metadata)
+        return hidden_states, residual
+
+
+class NemotronHAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = config.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        self.qkv_proj = QKVParallelLinear(
+            config.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class NemotronHAttentionDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.mixer = NemotronHAttention(
+            config,
+            layer_idx,
+            cache_config,
+            quant_config,
+            prefix=f"{prefix}.mixer",
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        **kwargs,
+    ):
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+        else:
+            hidden_states, residual = self.norm(hidden_states, residual)
+
+        hidden_states = self.mixer(hidden_states=hidden_states)
+        return hidden_states, residual
+
+
+ALL_DECODER_LAYER_TYPES = {
+    "M": NemotronHMambaDecoderLayer,
+    "-": NemotronHMLPDecoderLayer,
+    "*": NemotronHAttentionDecoderLayer,
+}
+
+
+class NemotronHModel(nn.Module):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config: NemotronHConfig = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
+        self.config = config
+        lora_vocab = ((lora_config.lora_extra_vocab_size *
+                       (lora_config.max_loras or 1)) if lora_config else 0)
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+        )
+
+        def get_layer(prefix: str):
+            layer_idx = int(prefix.rsplit(".", 1)[1])
+            layer_class = ALL_DECODER_LAYER_TYPES[
+                config.hybrid_override_pattern[layer_idx]]
+            return layer_class(
+                config,
+                layer_idx,
+                cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            len(config.hybrid_override_pattern),
+            get_layer,
+            prefix=f"{prefix}.layers")
+        self.make_empty_intmd_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size)
+
+        self.norm_f = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        mamba_cache_params: MambaCacheParams,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        attn_metadata = get_forward_context().attn_metadata
+
+        mamba2_metadata = prepare_mamba2_metadata(
+            chunk_size=self.config.chunk_size,
+            attn_metadata=attn_metadata,
+        )
+
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        residual = None
+        num_non_mamba_layers = 0
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            layer_mamba_cache_params = None
+            if isinstance(layer, NemotronHMambaDecoderLayer):
+                layer_mamba_cache_params = mamba_cache_params.at_layer_idx(
+                    i - num_non_mamba_layers)
+            else:
+                num_non_mamba_layers += 1
+
+            hidden_states, residual = layer(
+                positions=positions,
+                hidden_states=hidden_states,
+                residual=residual,
+                mamba_cache_params=layer_mamba_cache_params,
+                mamba2_metadata=mamba2_metadata,
+            )
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({
+                "hidden_states": hidden_states,
+                "residual": residual
+            })
+        hidden_states, _ = self.norm_f(hidden_states, residual)
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        attb_params_mapping = {
+            "q_proj": "q",
+            "k_proj": "k",
+            "v_proj": "v",
+        }
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "embeddings" in name:
+                name = name.replace("embeddings", "embed_tokens")
+
+            if "A_log" in name:
+                name = name.replace("A_log", "A")
+                loaded_weight = loaded_weight.to(torch.float32)
+
+            if "D" in name:
+                loaded_weight = loaded_weight.to(torch.float32)
+
+            if "dt_bias" in name:
+                loaded_weight = loaded_weight.to(torch.float32)
+
+            # load attn params
+            if any(proj in name for proj in ["q_proj", "k_proj", "v_proj"]):
+                weight_name = next(proj
+                                   for proj in ["q_proj", "k_proj", "v_proj"]
+                                   if proj in name)
+                name = name.replace(weight_name, "qkv_proj")
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight,
+                              attb_params_mapping[weight_name])
+            # load other params
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+            loaded_params.add(name)
+        return loaded_params
+
+
+class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
+                           IsHybrid, SupportsV0Only, SupportsQuant):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        config = vllm_config.model_config.hf_config
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        lora_config = vllm_config.lora_config
+        scheduler_config = vllm_config.scheduler_config
+        assert not cache_config.enable_prefix_caching, \
+            "NemotronH currently does not support prefix caching"
+
+        self.quant_config = vllm_config.quant_config
+
+        super().__init__()
+        self.config = config
+        self.scheduler_config = scheduler_config
+        self.model = NemotronHModel(vllm_config=vllm_config,
+                                    prefix=maybe_prefix(prefix, "model"))
+        self.unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.lm_head = ParallelLMHead(
+            self.unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE
+            # We need bigger padding if using lora for kernel
+            # compatibility
+            if not lora_config else lora_config.lora_vocab_padding_size,
+        )
+        # Used to track and store by the Mamba cache between steps.
+        self.mamba_cache: Optional[MambaCacheManager] = None
+
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size)
+
+        self.make_empty_intmd_tensors = (self.model.make_empty_intmd_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(self,
+                input_ids: torch.Tensor,
+                positions: torch.Tensor,
+                intermediate_tensors: Optional[IntermediateTensors] = None,
+                inputs_embeds: Optional[torch.Tensor] = None,
+                **kwargs):
+        if self.mamba_cache is None:
+
+            num_mamba_layers = self.model_config.get_num_layers_by_block_type(
+                self.vllm_config.parallel_config, LayerBlockType.mamba)
+
+            self.mamba_cache = MambaCacheManager(
+                self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers,
+                *self._get_mamba_cache_shape())
+        mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs)
+        hidden_states = self.model(input_ids, positions, mamba_cache_params,
+                                   intermediate_tensors, inputs_embeds)
+
+        return hidden_states
+
+    def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
+        return self.mamba_cache.copy_inputs_before_cuda_graphs(
+            input_buffers, **kwargs)
+
+    def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
+        return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
+
+    def _get_mamba_cache_shape(
+            self) -> tuple[tuple[int, int], tuple[int, int]]:
+        world_size = get_tensor_model_parallel_world_size()
+        hidden_size = self.config.hidden_size
+
+        conv_state_shape, temporal_state_shape = None, None
+
+        intermediate_size = self.config.expand * hidden_size
+
+        # if n_groups is not divisible by world_size, need to extend the shards
+        # to ensure all groups needed by a head is sharded along with it
+        n_groups = (
+            self.config.n_groups +
+            extra_groups_for_head_shards(self.config.n_groups, world_size))
+
+        # - heads and n_groups are TP-ed
+        conv_dim = (intermediate_size +
+                    2 * n_groups * self.config.ssm_state_size)
+        conv_state_shape = (
+            divide(conv_dim, world_size),
+            self.config.conv_kernel - 1,
+        )
+
+        # These are not TP-ed as they depend on A, dt_bias, D
+        # - they are typically small
+        #   e.g., (h_heads, d_head, d_state) = (128, 64, 128)
+        temporal_state_shape = (
+            divide(self.config.mamba_num_heads, world_size),
+            self.config.mamba_head_dim,
+            self.config.ssm_state_size,
+        )
+        return conv_state_shape, temporal_state_shape
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        # update name in weights before passing to loader
+        updated_weights = []
+        for name, loaded_weight in weights:
+            name = name.replace("backbone", "model")
+            updated_weights.append((name, loaded_weight))
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(updated_weights)
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index 9808fe05558e2636cc70e7274fedb7c6090d6d2e..a766ed9476a657dba04e9d29f694b0f0730e9959 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py
index 172434e66ae2c4a406fca1380c835781bca03e71..2f7f8e437f0adf760e67e8a988ccf412a1948340 100644
--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py
 # --------------------------------------------------------
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index fcb7c619a10206d294cd07507eb826703ec85981..1dc4df85c1bc4016225263ac0da531bc6f9e6325 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index 33adacdae5f5bbf0be7456ee53493c95716b02b4..499e6d30ed6b0834b3360b1a77a864b11f2c6d7d 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index 6364b89fb8379c36be2f9ce303c0c5b3960febb0..ebfdb690fe29bfad201a74c96283275e7db87a03 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +14,7 @@
 # limitations under the License.
 """Inference-only OLMoE model compatible with HuggingFace weights."""
 from collections.abc import Iterable
+from functools import partial
 from typing import Any, Optional, Union
 
 import torch
@@ -22,7 +24,10 @@ from transformers import PretrainedConfig
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_gather)
+from vllm.distributed.utils import split_tensor_along_last_dim
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -140,8 +145,11 @@ class OlmoeAttention(nn.Module):
             bias=False,
             quant_config=quant_config,
         )
-        self.q_norm = RMSNorm(hidden_size, eps=1e-5)
-        self.k_norm = RMSNorm(hidden_size, eps=1e-5)
+        self.tp_size = tp_size
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.q_norm = RMSNorm(self.total_num_heads * self.head_dim, eps=1e-5)
+        self.k_norm = RMSNorm(self.total_num_kv_heads * self.head_dim,
+                              eps=1e-5)
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
@@ -165,6 +173,20 @@ class OlmoeAttention(nn.Module):
                               quant_config=quant_config,
                               prefix=f"{prefix}.attn")
 
+    def _apply_qk_norm(self, q: torch.Tensor,
+                       k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.tp_size > 1:
+            q = tensor_model_parallel_all_gather(q.contiguous())
+            k = tensor_model_parallel_all_gather(k.contiguous())
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+        if self.tp_size > 1:
+            splitter = partial(split_tensor_along_last_dim,
+                               num_partitions=self.tp_size)
+            q = splitter(q)[self.tp_rank]
+            k = splitter(k)[self.tp_rank]
+        return q, k
+
     def forward(
         self,
         positions: torch.Tensor,
@@ -172,7 +194,7 @@ class OlmoeAttention(nn.Module):
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q, k = self.q_norm(q.contiguous()), self.k_norm(k.contiguous())
+        q, k = self._apply_qk_norm(q, k)
         q, k = self.rotary_emb(positions, q, k)
         attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 8376d62410d4b1cba19edf62297e3b4ff86d1d30..9eaac1e28dcd87eb09a94149c938c690be754132 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index da2a194e6bdf451cab868f24def3aa3921d2d80d..d121188ba5d4a91ca435274f6a0b74b590e84b15 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py
diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py
index e03705d48f3e88884917f5ec7d9d5abb7bb4a3e3..5c11d54c61247e552cc1e47837d421ad9afca2af 100644
--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/ovis/modeling_ovis.py
 # Copyright 2023 The vLLM team.
@@ -30,6 +31,9 @@ from vllm.config import VllmConfig
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.layers.quantization.gptq import GPTQConfig
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQMarlinConfig)
 from vllm.model_executor.models.aimv2 import AIMv2Model
 from vllm.model_executor.models.siglip import SiglipVisionModel
 from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn,
@@ -48,7 +52,7 @@ from vllm.transformers_utils.configs.ovis import (BaseVisualTokenizerConfig,
                                                   OvisConfig)
 from vllm.transformers_utils.processors.ovis import OvisProcessor
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import merge_multimodal_embeddings
 
 # Cannot find the following number from hf config.
@@ -106,12 +110,14 @@ class VisualTokenizer(torch.nn.Module):
         config: BaseVisualTokenizerConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
-    ):
+    ) -> nn.Module:
         model_type = config.backbone_config.model_type
         if model_type == "aimv2":
+            # No post rms_norm in Ovis2's AIMv2 ViT.
             return AIMv2Model(
                 config=config.backbone_config,
                 quant_config=quant_config,
+                require_post_norm=False,
                 prefix=prefix,
             )
         elif model_type == "siglip_vision_model":
@@ -124,14 +130,14 @@ class VisualTokenizer(torch.nn.Module):
             f"Unsupported visual tokenizer model_type: {model_type}")
 
     @property
-    def dtype(self):
+    def dtype(self) -> torch.dtype:
         return next(self.head.parameters()).dtype
 
     @property
-    def device(self):
+    def device(self) -> torch.device:
         return next(self.head.parameters()).device
 
-    def tokenize(self, logits):
+    def tokenize(self, logits: torch.Tensor) -> torch.Tensor:
         if self.config.tokenize_function == 'softmax':
             tokens = softmax(logits, dim=-1)
         elif self.config.tokenize_function == 'gumbel_argmax':
@@ -144,7 +150,7 @@ class VisualTokenizer(torch.nn.Module):
                 f'or st_argmax, but got {self.config.tokenize_function}')
         return tokens
 
-    def encode(self, pixel_values):
+    def encode(self, pixel_values: torch.Tensor) -> torch.Tensor:
         features = self.backbone(pixel_values)
         if self.config.drop_cls_token:
             features = features[:, 1:, :]
@@ -395,7 +401,7 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
 @MULTIMODAL_REGISTRY.register_processor(OvisMultiModalProcessor,
                                         info=OvisProcessingInfo,
                                         dummy_inputs=OvisDummyInputsBuilder)
-class Ovis(nn.Module, SupportsMultiModal):
+class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -410,7 +416,7 @@ class Ovis(nn.Module, SupportsMultiModal):
 
         self.visual_tokenizer = VisualTokenizer(
             config=config.visual_tokenizer_config,
-            quant_config=quant_config,
+            quant_config=self._maybe_ignore_quant_config(quant_config),
             prefix=f"{prefix}.visual_tokenizer",
         )
 
@@ -421,9 +427,16 @@ class Ovis(nn.Module, SupportsMultiModal):
         text_model_type = self.config.get_text_config().model_type
         self.image_pad_token_id = IMAGE_PAD_TOKEN_ID_MAP[text_model_type]
 
-        # TODO(Isotr0py): PP support
-        # self.make_empty_intermediate_tensors = (
-        #    self.language_model.make_empty_intermediate_tensors)
+        self.make_empty_intermediate_tensors = (
+            self.get_language_model().make_empty_intermediate_tensors)
+
+    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
+        # GPTQ configs do not have a list of ignored modules, however AutoGPTQ
+        # seems to avoid vision encoder sections for some models.
+        # See: https://huggingface.co/AIDC-AI/Ovis2-2B-GPTQ-Int4
+        if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)):
+            return None
+        return quant_config
 
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[OvisImagePatchInputs]:
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 427005e9b70410adb4f11f12c863b4d352cc5c6b..a0e2912578c51fabf4fe8a3db764f08f7581226e 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Literal, Optional, TypedDict, Union
 
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
index d46b95fea5a8af1fcaca07aaf9095e5fec0eb593..f8db99eb92ba87b64147b5c985e7a74200f435be 100644
--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 330ad5c59448bac61eced59ed02bb41575a1a53c..21d517b3a490fc958f05c10bd88b5cacd2c6ce2d 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py
diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py
index 8f84e0726951d063f96b621d778fb9afeb25fce5..f4e870c530309eb96012b78ed84e706abe4dfc1a 100644
--- a/vllm/model_executor/models/phi3.py
+++ b/vllm/model_executor/models/phi3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from llama.py
 """Inference-only Phi3 model code inherit from Llama.py"""
diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py
index d00d7d886d671b8f8266d90c18d7374c9fac5f00..533655fd52004b0c006b5990b67e7a1cb1f0b96b 100644
--- a/vllm/model_executor/models/phi3_small.py
+++ b/vllm/model_executor/models/phi3_small.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index b757e661d771204e208e08753844ceca135bba5e..376c53d2cb99afb7650a70530e29a091af8492ef 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 418ff900ffd52fa84cc8f530fd3df9798d073ead..924e6436897d4c13129b7238065bf8d4005b3c7f 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Any, Literal, Optional, TypedDict, Union
diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py
index 98cef75069ae2af271bbc2bfd6c1faaf312e65cd..ae7a8a732c446a633603a947de66a505d71fe2b5 100644
--- a/vllm/model_executor/models/phi4mm_audio.py
+++ b/vllm/model_executor/models/phi4mm_audio.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 # Code copied from Microsoft/MoE by Jacob Platin (jacobplatin@microsoft.com)
diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py
index f468fdbd5417fe18872b5f3fa6fafe99ac40eefe..c4890d8427e2a2df07c72c324ec9291b2d58e520 100644
--- a/vllm/model_executor/models/phi4mm_utils.py
+++ b/vllm/model_executor/models/phi4mm_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 # Code copied from Microsoft/MoE by Jacob Platin (jacobplatin@microsoft.com)
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index d9917c26d1b129835bf78e6ee6c3bea1f3cf438c..dddd19c7462be9e7066790c28a6e0f1f085b1b68 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 9f28d4cef4251e35c6aba8bb0817cafac0cb4299..705586b6a6ea64a69f91198810b52555a948c00b 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 55a65f8078a4d12da3c22f38eded311ec5b04920..670576c68efdd4bb3191dcd6a60e2d0a9b3ebdec 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only PLaMo2 model."""
 import math
 from collections.abc import Iterable
diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py
index 40ac5e30a368ba33ab858993627f6e2cacbc7ab9..4fdcae5de644a05aad4ef7ee749777c56153e446 100644
--- a/vllm/model_executor/models/prithvi_geospatial_mae.py
+++ b/vllm/model_executor/models/prithvi_geospatial_mae.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2025 The vLLM team.
 # Copyright 2025 IBM.
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 2fda87a4ff0f6c4ff956b983cad8d4da286c5e97..e804f03e014e15849b7780c173e28016244e3f54 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 0d0d98c59dbc7cf3b614f2f637e832d15a982c91..23f65b99c22cec18cea9d93ba1b386c5e160c33f 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
@@ -34,32 +35,27 @@ from vllm.attention import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import IntermediateTensors, PoolerOutput
+from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
-                    extract_layer_index, is_pp_missing_parameter,
+from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
+                    is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
-logger = init_logger(__name__)
-
 
 class Qwen2MLP(nn.Module):
 
@@ -499,69 +495,3 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                            if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
-
-
-class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
-
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
-        pooler_config = vllm_config.model_config.pooler_config
-
-        self.config = config
-        self.lora_config = lora_config
-
-        self.quant_config = quant_config
-        self.model = Qwen2Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
-
-        # TODO: Replace this model class with as_embedding_model(
-        # Qwen2ForCausalLM) after changing the default pooling method
-        if pooler_config.pooling_type is None:
-            logger.warning(
-                "This embedding model will default to last-token pooling in "
-                "an upcoming version. To avoid breaking changes, you should "
-                "pass `--override-pooler-config '{\"pooling_type\": \"MEAN\"}'`"
-                " explicitly.")
-
-        self._pooler = Pooler.from_config_with_defaults(
-            pooler_config,
-            pooling_type=PoolingType.MEAN,
-            normalize=True,
-            softmax=False)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-    ) -> torch.Tensor:
-        return self.model(input_ids, positions, intermediate_tensors)
-
-    def pooler(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> Optional[PoolerOutput]:
-        return self._pooler(hidden_states, pooling_metadata)
-
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-        weights = self.hf_to_vllm_mapper.apply(weights)
-        weights = ((name, data) for name, data in weights
-                   if not name.startswith("lm_head."))
-        self.model.load_weights(weights)
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index d89b822dd8739a6ea323f7f0d9df4ae439a2c33c..7172394e420053e496b90bb828cf970389949526 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 68dd07820189e271bfaf37c323f238358823c878..7770ec711ce7809451b5d4b66324bf59a7ccb5f5 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -821,23 +822,17 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
     dummy_inputs=Qwen2_5_VLDummyInputsBuilder)
 class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                                          SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
 
     # To ensure correct weight loading and mapping.
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
-        "lm_head.": "language_model.lm_head.",
-        "model.": "language_model.model.",
-    })
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            # mapping for new names in checkpoint saved after transformers v4.52
+            "model.language_model.": "language_model.model.",
+            "model.visual.": "visual.",
+            # mapping for original checkpoint
+            "lm_head.": "language_model.lm_head.",
+            "model.": "language_model.model.",
+        })
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 3182a753257878dc4024232eb68b941b1c80d772..6951630c6f2317e3cc6b6030a2054704fcc77015 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 143b9f98b02938e509567a1a8390a2f4a7a3c3f0..a2c65f4b5edb4ec4bf87552d26f2e97dfb2c9f49 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index 81dc38988c9d9dec0d2c5f0b81f20a8696a58ff6..76d7ecdd1272b0a78161f04152fe0aa560b18161 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 0ff0836b089758447dc8b1c90eb5a6cda93aa546..a4f8a361ec7105279bd05f94d926b85aa3ae1095 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
@@ -1069,23 +1070,17 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
                                         dummy_inputs=Qwen2VLDummyInputsBuilder)
 class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                                       SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
 
     # To ensure correct weight loading and mapping.
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
-        "lm_head.": "language_model.lm_head.",
-        "model.": "language_model.model.",
-    })
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            # mapping for new names in checkpoint saved after transformers v4.52
+            "model.language_model.": "language_model.model.",
+            "model.visual.": "visual.",
+            # mapping for original checkpoint
+            "lm_head.": "language_model.lm_head.",
+            "model.": "language_model.model.",
+        })
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index dbe2be8a73d5949a3cf33b55cc9ec8b2d18871ee..393ce41a91a00a6affce5ed08e03ded0e4a09abc 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 8a4c2850dda3ad57cef5b9aaaf463433a2df2a58..823197fc935036eb7aa327cf1c2fa2e4ccccc761 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py
index f5d242fdf1c26fb83834df623ac6466df0ddc406..e828ce9c98499dd6546c709d72b44d44dba6a4e9 100644
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/Qwen/Qwen-VL/blob/main/modeling_qwen.py
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 854eea8a43f6794efedc580642f269f1a0155db0..43fc382f18680ed5d3e3111095d1d315cb94ecb1 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Whenever you add an architecture to this page, please also update
 `tests/models/registry.py` with example HuggingFace models for it.
@@ -92,6 +93,7 @@ _TEXT_GENERATION_MODELS = {
     "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
     "MiMoForCausalLM": ("mimo", "MiMoForCausalLM"),
     "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"),
+    "NemotronHForCausalLM": ("nemotron_h", "NemotronHForCausalLM"),
     "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
     "Olmo2ForCausalLM": ("olmo2", "Olmo2ForCausalLM"),
     "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"),
@@ -143,7 +145,7 @@ _EMBEDDING_MODELS = {
     "ModernBertModel": ("modernbert", "ModernBertModel"),
     "NomicBertModel": ("bert_with_rope", "NomicBertModel"),
     "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"),
-    "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"),
+    "Qwen2Model": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"),
     "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"),
@@ -212,6 +214,7 @@ _MULTIMODAL_MODELS = {
     "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
     "UltravoxModel": ("ultravox", "UltravoxModel"),
     "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
+    "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),  # noqa: E501
     # [Encoder-decoder]
     "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"),  # noqa: E501
     "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"),  # noqa: E501
@@ -224,6 +227,7 @@ _SPECULATIVE_DECODING_MODELS = {
     "MiMoMTPModel": ("mimo_mtp", "MiMoMTP"),
     "EAGLEModel": ("eagle", "EAGLE"),
     "EagleLlamaForCausalLM": ("llama_eagle", "EagleLlamaForCausalLM"),
+    "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"),
     "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
     "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"),
     "MedusaModel": ("medusa", "Medusa"),
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index 9a4d0ab2dd4d7dce5b76a893326c5eac8ab330e0..8fa8b89798d00ee4ef524f15c83b3f2cb876e307 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from collections.abc import Iterable
@@ -9,7 +10,7 @@ from torch import nn
 from transformers import RobertaConfig
 
 from vllm.config import VllmConfig
-from vllm.model_executor.layers.pooler import CrossEncodingPooler
+from vllm.model_executor.layers.pooler import ClassifierPooler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -186,7 +187,9 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding,
                                  embedding_class=RobertaEmbedding,
                                  add_pooling_layer=False)
         self.classifier = RobertaClassificationHead(config)
-        self._pooler = CrossEncodingPooler(config, self.classifier)
+
+        self._pooler = ClassifierPooler(vllm_config.model_config,
+                                        self.classifier)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         bert_weights, task_weights = roberta_task_weights_filter(weights)
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 3b5334afa7af829240a73f6162399aa1774aaeb6..3630f59f53e0a6a2551fac5f6646ccf4b46fa32b 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Implementation of SiglipVisionModel intended to be only used
 within a vision language model."""
 
@@ -130,11 +131,10 @@ class SiglipVisionEmbeddings(nn.Module):
         embeddings = patch_embeds.flatten(2).transpose(1, 2)
 
         if interpolate_pos_encoding:
-            embeddings = embeddings + self.interpolate_pos_encoding(
+            embeddings += self.interpolate_pos_encoding(
                 embeddings, height, width)
         else:
-            embeddings = embeddings + self.position_embedding(
-                self.position_ids)
+            embeddings += self.position_embedding(self.position_ids)
         return embeddings
 
 
@@ -271,12 +271,12 @@ class SiglipEncoderLayer(nn.Module):
 
         hidden_states = self.layer_norm1(hidden_states)
         hidden_states, _ = self.self_attn(hidden_states=hidden_states)
-        hidden_states = residual + hidden_states
+        hidden_states += residual
 
         residual = hidden_states
         hidden_states = self.layer_norm2(hidden_states)
         hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
+        hidden_states += residual
 
         return hidden_states, None
 
@@ -354,7 +354,8 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
 
         residual = hidden_state
         hidden_state = self.layernorm(hidden_state)
-        hidden_state = residual + self.mlp(hidden_state)
+        hidden_state = self.mlp(hidden_state)
+        hidden_state += residual
 
         return hidden_state[:, 0]
 
diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py
index eefadda918f62b9dca9aa94949024c0643371a00..08c47facad9742aa70e636d092a5ab8271ba8321 100644
--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
 # --------------------------------------------------------
diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py
index 31dec55026baec47a629424db9217515c5da2628..0f22ba5b406ce11c4f19d99290d7425a8caeb0c1 100644
--- a/vllm/model_executor/models/smolvlm.py
+++ b/vllm/model_executor/models/smolvlm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index fcd17cc1c2ba43e92db59a3e8f501ca649a32213..8dd52f1d204a581cfd5330613cecda2f7379d7b6 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 86ce813ddf3dd13a763e401e8205973b2fb2e407..d6ec743ce845e30db03a1fb1857541299ffaf2b9 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team.
 # All rights reserved.
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index f4ba5a8030e52bf8943b7856b92e38060243ad4e..9d9a2bff0e43fc6e43175b2b9f03b3f10b710e2b 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py
new file mode 100644
index 0000000000000000000000000000000000000000..5aa3ddabc19ecdc4268a9adbdfa6c69a35653a58
--- /dev/null
+++ b/vllm/model_executor/models/tarsier.py
@@ -0,0 +1,643 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar,
+                    Union, cast)
+
+import torch
+import torch.nn as nn
+from transformers import BatchFeature, CLIPVisionConfig
+from transformers import LlavaConfig as HfLlavaConfig
+from transformers import PretrainedConfig, SiglipVisionConfig
+from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
+from transformers.models.llava import LlavaProcessor
+from transformers.processing_utils import (ProcessingKwargs, Unpack,
+                                           _validate_images_text_input_order)
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+
+from vllm.config import VllmConfig
+from vllm.inputs import InputProcessingContext
+from vllm.jsontree import json_map_leaves
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.models.llava import LlavaDummyInputsBuilder
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   ImageSize, MultiModalDataItems)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, ProcessingCache,
+                                        PromptReplacement, PromptUpdate)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+
+from .clip import CLIPVisionModel
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .siglip import SiglipVisionModel
+from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
+                    maybe_prefix, merge_multimodal_embeddings)
+from .vision import VisionEncoderInfo, get_vision_encoder_info
+
+
+class TarsierImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    pixel_values: torch.Tensor
+
+
+class TarsierImageEmbeddingInputs(TypedDict):
+    type: Literal["image_embeds"]
+    data: torch.Tensor
+
+
+TarsierImageInputs = Union[TarsierImagePixelInputs,
+                           TarsierImageEmbeddingInputs]
+
+
+class TarsierHfConfig(Protocol):  # Based on the Tarsier's LlavaConfig
+    vision_config: Final[PretrainedConfig]
+    text_config: Final[PretrainedConfig]  # Added from Tarsier's LlavaConfig
+    image_token_index: Final[int]
+    vision_feature_select_strategy: Final[str]
+    vision_feature_layer: Final[Union[int, list[int]]]
+    projector_hidden_act: Final[str]
+    image_newline_idx: Final[int]
+    image_new_idx: Final[int]
+    multimodal_projector_bias: bool = True
+
+
+class TarsierProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+        },
+        "images_kwargs": {},
+    }
+
+
+class TarsierProcessor(LlavaProcessor):
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput],
+                    list[PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[TarsierProcessorKwargs],
+    ) -> BatchFeature:
+        if images is None and text is None:
+            raise ValueError(
+                "You have to specify at least one of `images` or `text`.")
+
+        # check if images and text inputs are reversed for BC
+        images, text = _validate_images_text_input_order(images, text)
+
+        output_kwargs = self._merge_kwargs(
+            TarsierProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            image_inputs = self.image_processor(
+                images, **output_kwargs["images_kwargs"])
+        else:
+            image_inputs = {}
+
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError("Invalid input text. Please provide a string,"
+                             " or a list of strings")
+
+        # try to expand inputs in processing if we have the necessary parts
+        prompt_strings = text
+        if image_inputs.get("pixel_values") is not None:
+            # Replace the image token with the expanded image token sequence
+            pixel_values = image_inputs["pixel_values"]
+            height, width = get_image_size(to_numpy_array(pixel_values[0]))
+            num_image_tokens = (height // self.patch_size) * (
+                width // self.patch_size +
+                1) + self.num_additional_image_tokens + 1
+            if self.vision_feature_select_strategy == "default":
+                num_image_tokens -= 1
+
+            prompt_strings = []
+            for sample in text:
+                sample = sample.replace(self.image_token,
+                                        self.image_token * num_image_tokens)
+                prompt_strings.append(sample)
+
+        return_tensors = output_kwargs["text_kwargs"].pop(
+            "return_tensors", None)
+        text_inputs = self.tokenizer(prompt_strings,
+                                     **output_kwargs["text_kwargs"])
+        return BatchFeature(data={
+            **text_inputs,
+            **image_inputs
+        },
+                            tensor_type=return_tensors)
+
+
+class TarsierMultiModalProjector(nn.Module):
+
+    def __init__(self,
+                 vision_hidden_size: int,
+                 text_hidden_size: int,
+                 projector_hidden_act: str,
+                 multimodal_projector_bias: bool,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
+        super().__init__()
+
+        self.linear_1 = ColumnParallelLinear(vision_hidden_size,
+                                             text_hidden_size,
+                                             bias=multimodal_projector_bias,
+                                             quant_config=quant_config,
+                                             prefix=f"{prefix}.linear_1")
+        self.act = get_act_fn(projector_hidden_act)
+        self.linear_2 = RowParallelLinear(text_hidden_size,
+                                          text_hidden_size,
+                                          bias=multimodal_projector_bias,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.linear_2")
+
+    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.linear_1(image_features)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class TarsierProcessingInfo(BaseProcessingInfo):
+
+    def get_hf_config(self) -> TarsierHfConfig:
+        return self.ctx.get_hf_config(HfLlavaConfig)
+
+    def get_vision_encoder_info(self) -> VisionEncoderInfo:
+        return get_vision_encoder_info(self.get_hf_config())
+
+    def get_hf_processor(self, **kwargs: object) -> TarsierProcessor:
+        hf_processor = self.ctx.get_hf_processor(TarsierProcessor, **kwargs)
+        # Patch for patch_size if needed (copied from vLLM LLaVA)
+        if hasattr(hf_processor,
+                   'patch_size') and hf_processor.patch_size is None:
+            patch_size = self.get_vision_encoder_info().get_patch_size()
+            hf_processor.patch_size = patch_size
+        return hf_processor
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
+    def _apply_feature_select_strategy(
+        self,
+        strategy: str,
+        encoder_num_image_tokens: int,
+    ) -> int:
+        if strategy == "default":
+            return encoder_num_image_tokens - 1
+        if strategy == "full":
+            return encoder_num_image_tokens
+        msg = f"Unexpected feature select strategy: {strategy!r}"
+        raise NotImplementedError(msg)
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        vision_encoder_info = self.get_vision_encoder_info()
+        num_projected_patches = self._apply_feature_select_strategy(
+            hf_config.vision_feature_select_strategy,
+            vision_encoder_info.get_num_image_tokens(
+                image_width=image_width,
+                image_height=image_height,
+            ),
+        )
+        if num_projected_patches <= 0:
+            default_size = self.get_image_size_with_most_features()
+            num_projected_patches_default = self._apply_feature_select_strategy(
+                hf_config.vision_feature_select_strategy,
+                vision_encoder_info.get_num_image_tokens(
+                    image_width=default_size.width,
+                    image_height=default_size.height,
+                ),
+            )
+            if num_projected_patches_default <= 0:
+                raise ValueError(
+                    "Could not determine a valid number of image patches.")
+            num_projected_patches = num_projected_patches_default
+        num_height_patches = int(math.sqrt(num_projected_patches))
+        total_image_tokens_for_llm = num_projected_patches \
+            + num_height_patches + 1
+        return total_image_tokens_for_llm
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        vision_encoder_info = self.get_vision_encoder_info()
+        width = height = vision_encoder_info.get_image_size()
+        return ImageSize(width=width, height=height)
+
+    def get_max_image_tokens(self) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
+        return self.get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
+        )
+
+    def get_image_newline_idx(self) -> int:
+        return self.get_hf_config().image_newline_idx
+
+    def get_image_new_idx(self) -> int:
+        return self.get_hf_config().image_new_idx
+
+
+_I_Tarsier = TypeVar("_I_Tarsier", bound=TarsierProcessingInfo)
+
+
+class TarsierDummyInputsBuilder(LlavaDummyInputsBuilder[_I_Tarsier]):
+
+    pass
+
+
+class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]):
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        hf_config = self.info.get_hf_config()
+        image_token_id = hf_config.image_token_index  # The <IMAGE> token ID
+
+        def get_replacement(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                num_projected_patches = images.get_feature_size(item_idx)
+                # This assumes num_projected_patches is a perfect square
+                num_height_patches = int(math.sqrt(num_projected_patches))
+                num_final_image_tokens = num_projected_patches \
+                + num_height_patches + 1
+            else:
+                image_size = images.get_image_size(item_idx)
+                num_final_image_tokens = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                )
+
+            return [image_token_id] * num_final_image_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],  # Replace each single <IMAGE> token
+                replacement=get_replacement,
+            ),
+        ]
+
+
+def _build_tarsier_hf_info(
+        ctx: InputProcessingContext) -> TarsierProcessingInfo:
+    return TarsierProcessingInfo(ctx)
+
+
+def _build_tarsier_hf_processor(
+    info: _I_Tarsier,
+    dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier],
+    *,
+    cache: Optional[ProcessingCache] = None,
+) -> BaseMultiModalProcessor:
+    if isinstance(info, TarsierProcessingInfo):
+        return TarsierMultiModalProcessor(
+            info,
+            dummy_inputs,
+            cache=cache,
+        )
+    raise NotImplementedError(type(info))
+
+
+def init_vision_tower_for_tarsier(
+    hf_config: TarsierHfConfig,  # Use the Tarsier specific config protocol
+    quant_config: Optional[QuantizationConfig],
+    *,
+    require_post_norm: Optional[bool] = None,
+    prefix: str = "",
+) -> Union[CLIPVisionModel, SiglipVisionModel]:
+    vision_config = hf_config.vision_config
+
+    feature_layers = hf_config.vision_feature_layer
+    base_num_hidden_layers = vision_config.num_hidden_layers
+
+    def _get_layer_index(feature_layer_index: int,
+                         num_hidden_layers_total: int) -> int:
+        if feature_layer_index < 0:
+            return num_hidden_layers_total + feature_layer_index + 1
+        return feature_layer_index
+
+    if isinstance(feature_layers, int):
+        num_hidden_layers_to_init = _get_layer_index(feature_layers,
+                                                     base_num_hidden_layers)
+    elif isinstance(feature_layers, (list, tuple)):
+        num_hidden_layers_to_init = max(
+            _get_layer_index(idx, base_num_hidden_layers)
+            for idx in feature_layers)
+    else:
+        raise TypeError(f"vision_layer_feature type: {type(feature_layers)}"
+                        " is not supported")
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        return CLIPVisionModel(
+            vision_config,
+            quant_config=quant_config,
+            num_hidden_layers_override=num_hidden_layers_to_init,
+            require_post_norm=require_post_norm,
+            prefix=prefix,
+        )
+    elif isinstance(vision_config, SiglipVisionConfig):
+        return SiglipVisionModel(
+            vision_config,
+            quant_config=quant_config,
+            num_hidden_layers_override=num_hidden_layers_to_init,
+            require_post_norm=require_post_norm,
+            prefix=prefix,
+        )
+
+    msg = f"Unsupported vision config for Tarsier: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
+@MULTIMODAL_REGISTRY.register_processor(_build_tarsier_hf_processor,
+                                        info=_build_tarsier_hf_info,
+                                        dummy_inputs=TarsierDummyInputsBuilder)
+class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                      SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__()
+        config: TarsierHfConfig = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config  # Storing the Tarsier-specific HF config
+        self.vision_tower = init_vision_tower_for_tarsier(
+            config,
+            quant_config,
+            require_post_norm=False,
+            prefix=maybe_prefix(prefix, "vision_tower"))
+        projector_bias = getattr(config, "multimodal_projector_bias", True)
+
+        self.multi_modal_projector = TarsierMultiModalProjector(
+            vision_hidden_size=config.vision_config.hidden_size,
+            text_hidden_size=config.text_config.hidden_size,
+            projector_hidden_act=config.projector_hidden_act,
+            multimodal_projector_bias=projector_bias,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "multi_modal_projector"))
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=config.
+            text_config,  # Use text_config from Tarsier's main config
+            prefix=maybe_prefix(prefix, "language_model"),
+        )
+        self.register_buffer('image_newline_idx_tensor',
+                             torch.tensor([config.image_newline_idx],
+                                          dtype=torch.long),
+                             persistent=False)
+        self.register_buffer('image_new_idx_tensor',
+                             torch.tensor([config.image_new_idx],
+                                          dtype=torch.long),
+                             persistent=False)
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors)
+
+    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
+        h = w = self.config.vision_config.image_size
+        expected_dims = (3, h, w)  # Assuming 3 channels
+        actual_dims = tuple(data.shape[1:])
+
+        if actual_dims != expected_dims:
+            expected_expr = ("batch_size", *map(str, expected_dims))
+            raise ValueError(
+                f"The expected shape of pixel values is {expected_expr}. "
+                f"You supplied {tuple(data.shape)}.")
+        return data
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[TarsierImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+
+        if pixel_values is None and image_embeds is None:
+            return None
+
+        if pixel_values is not None:
+            if not isinstance(pixel_values, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values)}")
+
+            return TarsierImagePixelInputs(
+                type="pixel_values",
+                pixel_values=self._validate_pixel_values(
+                    flatten_bn(pixel_values, concat=True)),
+            )
+
+        if image_embeds is not None:
+            if not isinstance(image_embeds, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image embeddings. "
+                                 f"Got type: {type(image_embeds)}")
+            return TarsierImageEmbeddingInputs(
+                type="image_embeds",
+                data=flatten_bn(image_embeds, concat=True),
+            )
+
+        raise AssertionError("This line should be unreachable.")
+
+    def _select_image_features(self, image_features: torch.Tensor, *,
+                               strategy: str) -> torch.Tensor:
+        if strategy == "default":
+            return image_features[:, 1:]
+        elif strategy == "full":
+            return image_features
+        raise ValueError(f"Unexpected select feature strategy: {strategy}")
+
+    def _image_pixels_to_features(
+        self,
+        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
+        pixel_values: Union[torch.Tensor, list[torch.Tensor]],
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        # From vLLM LLaVA, vision tower output handling
+        image_hidden_states = vision_tower(pixel_values)
+        if not isinstance(image_hidden_states, torch.Tensor):
+            raise TypeError(
+                f"image_hidden_states type: {type(image_hidden_states)}"
+                " is not supported")
+
+        def select_features_fn(leaf: torch.Tensor):
+            return self._select_image_features(
+                leaf,
+                strategy=self.config.vision_feature_select_strategy,
+            )
+
+        selected_features = cast(
+            Union[torch.Tensor, tuple[torch.Tensor, ...]],
+            json_map_leaves(select_features_fn, image_hidden_states),
+        )
+        return selected_features
+
+    def _add_tarsier_split_tokens(
+            self, projected_image_features: torch.Tensor) -> torch.Tensor:
+        """
+        Implements Tarsier's `add_split_tokens` logic.
+        """
+        num_images, num_projected_patches, embed_dim = \
+            projected_image_features.shape
+        num_height_patches = int(math.sqrt(num_projected_patches))
+        num_width_patches = num_projected_patches // num_height_patches
+        device = projected_image_features.device
+        embedding_layer = self.language_model.model.embed_tokens
+        image_newline_emb = embedding_layer(
+            self.image_newline_idx_tensor.to(device)).squeeze(0)
+        image_new_emb = embedding_layer(
+            self.image_new_idx_tensor.to(device)).squeeze(0)
+        try:
+            current_image_features_grid = projected_image_features.view(
+                num_images, num_height_patches, num_width_patches, embed_dim)
+        except RuntimeError as e:
+            raise RuntimeError(
+                "Cannot reshape projected_image_features"
+                f" with shape {projected_image_features.shape} "
+                f"to ({num_images}, {num_height_patches},"
+                f" {num_width_patches}, {embed_dim}). "
+                "Ensure num_projected_patches is compatible"
+                " with a grid structure. "
+                f"num_projected_patches={num_projected_patches}, "
+                f"derived num_height_patches={num_height_patches}. ") from e
+
+        image_newline_expanded = image_newline_emb.expand(
+            (num_images, num_height_patches, 1, embed_dim))
+        features_with_newlines = torch.cat(
+            [current_image_features_grid, image_newline_expanded],
+            dim=2  # Concatenate along width dim
+        )
+        new_num_patches_after_newline = num_projected_patches \
+            + num_height_patches
+        features_with_newlines_flat = features_with_newlines.view(
+            num_images, new_num_patches_after_newline, embed_dim)
+        image_new_expanded = image_new_emb.expand((num_images, 1, embed_dim))
+        final_image_features = torch.cat(
+            [features_with_newlines_flat, image_new_expanded],
+            dim=1  # Concatenate along patch sequence dim
+        )
+        return final_image_features
+
+    def _process_image_pixels(
+        self,
+        inputs: TarsierImagePixelInputs,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        assert self.vision_tower is not None
+        pixel_values = inputs["pixel_values"]
+        image_features_selected = self._image_pixels_to_features(
+            self.vision_tower, pixel_values)  # type: ignore
+        if isinstance(image_features_selected, torch.Tensor):
+            projected_features = self.multi_modal_projector(
+                image_features_selected)
+            final_features = self._add_tarsier_split_tokens(projected_features)
+            return final_features
+        else:
+            raise TypeError(
+                f"_image_pixels_to_features type:"
+                f" {type(image_features_selected)} is not supported")
+
+    def _process_image_input(
+        self,
+        image_input: TarsierImageInputs,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        if image_input["type"] == "image_embeds":
+            projected_features = image_input["data"]
+            if isinstance(projected_features, torch.Tensor):
+                return self._add_tarsier_split_tokens(projected_features)
+            else:
+                raise ValueError("Incorrect type of image_embeds. "
+                                 f"Got type: {type(projected_features)}. ")
+        assert self.vision_tower is not None
+        return self._process_image_pixels(image_input)
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def get_multimodal_embeddings(
+            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return None
+        return self._process_image_input(image_input)
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+        if multimodal_embeddings is not None:
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                multimodal_embeddings,
+                self.config.image_token_index,
+            )
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+        elif inputs_embeds is None:
+            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      vision_embeddings)
+            input_ids = None
+        hidden_states = self.language_model.model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds)
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(hidden_states,
+                                                  sampling_metadata)
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py
index 7d713d23c772dc30b34febc5ab6cc2d6bd6af967..f0b31b1332fb1ee7d941b958055caa4375a8df82 100644
--- a/vllm/model_executor/models/telechat2.py
+++ b/vllm/model_executor/models/telechat2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
diff --git a/vllm/model_executor/models/teleflm.py b/vllm/model_executor/models/teleflm.py
index e05f23f99e97920c9c1d2a9508fd922ab9605c58..3666f7011a9979082c8a8fc4d28c472439b01b17 100644
--- a/vllm/model_executor/models/teleflm.py
+++ b/vllm/model_executor/models/teleflm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index b87a2ebf211acc5f128449e2e2231b9756760432..2f78d9d4cc06501a157f18b3d21d0e1f64079839 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 The vLLM team.
 #
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index c1a4dc1b33d7857d53ef215f689aef6be8b5fe17..43836f2956c3b35778e913580f73eebb3df4de1d 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
 """PyTorch Ultravox model."""
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 3d821d3dc6b5825c6e3c470aeba345bdafe3f129..aa88f4210160519c424f9466ae53e6f8c77ff7fe 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from collections.abc import Iterable, Mapping
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 901d83ec5b9e64044cc242331e0def94d1fa6142..ac6a659bbaa3240446e19859328c54d314f90bab 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import Final, Generic, Optional, Protocol, TypeVar, Union
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index c6e303d6024a4e3b5741f9f405fbef0785dba24f..3ee5f7dba01f093588d4b6fb70671705a58a1136 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py
index 48e254bdd85bd61096e6af2f885ef9865f30a7e3..a4f97c774f70680f0050b395612647f55284fb9f 100644
--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """PyTorch Zamba2 model implementation for vLLM.
 
 This module implements the Zamba2 architecture from 
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index 34a0b527b585ed55c34e0954f3231e780fee27c8..750ee785026880fdb01349c2fa8931e333716e54 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from fractions import Fraction
 from typing import Callable, Optional, Union
diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py
index 4c5db7396c03c0ce1e7b536e668296790751ea81..4dd443bc26ea0aaa4449fe22ac96a523c37cc590 100644
--- a/vllm/model_executor/pooling_metadata.py
+++ b/vllm/model_executor/pooling_metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Any
diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 6b83a59b598863987f639363ecfb250e4f13d19e..56f0f0984bfa0ec9a3054c9a9d7b351355cfe2e7 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from array import array
 from dataclasses import dataclass
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index f9d89e64bd9db15c9879084f1f793a2dfa6f44ff..cbaa34bfc30b2126bf1b35003ef97c27fc50f34b 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utils for model executor."""
+import copy
 from typing import Any, Optional
 
 import torch
@@ -48,6 +50,28 @@ def _make_synced_weight_loader(original_weight_loader):
 
     def _synced_weight_loader(param, *args, **kwargs):
         original_weight_loader(param, *args, **kwargs)
-        torch._sync(param)
+        # torch._sync doesn't support, is not needed for CPU tensors.
+        if param.device != torch.device("cpu"):
+            torch._sync(param)
 
     return _synced_weight_loader
+
+
+def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]:
+    parent_map = copy.deepcopy(getattr(model, "packed_modules_mapping", {}))
+
+    # don't infer mapping if the model has defined it explicitly.
+    if parent_map:
+        return parent_map
+
+    # We only check main components instead of whole model submodules
+    for child in model.children():
+        child_map = getattr(child, "packed_modules_mapping", {})
+        if any((k in parent_map and parent_map[k] != v)
+               for k, v in child_map.items()):
+            raise ValueError(
+                f"Can't update {type(model).__name__}'s packed_modules_mapping "
+                f"safely because of conflicts from {type(child).__name__}.")
+        else:
+            parent_map.update(child_map)
+    return parent_map
\ No newline at end of file
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 815e34d5ac5db9c90e20be4f41193ccf97dfb04f..2ef9f1ccc02be145fd9eae3782e75b7ba56246c6 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from .base import MultiModalPlaceholderMap
 from .hasher import MultiModalHashDict, MultiModalHasher
 from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index 1fd2ab7f87d1f41aa80e2ef68353d578a06fbc73..fbb29276f6bdf255dd4d8edf511f8a232aa71978 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import base64
 from io import BytesIO
 from pathlib import Path
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 184c801e64d86ef3b28dd691bf6fd0793da38055..7188ed14c5735717f9f19514a795375442a13396 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py
index b4cd6a90834c0983b57bd4850350bd66c5090096..b7988359737ac080d6a71394a7c6612db3b436b2 100644
--- a/vllm/multimodal/hasher.py
+++ b/vllm/multimodal/hasher.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pickle
 from collections.abc import Iterable, Mapping
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index a63ec0bd8ada465047b0500d3cf0195c2243e37b..e673632d436649f0ddc8923b2722dfc82d06a4bd 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 from io import BytesIO
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 600a34d39ef682d818add491e2149b744b5c921e..5cb720381d94b7e21c2ae012b0ecba7828ba0884 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
@@ -679,7 +680,8 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
         return self._items_by_modality.keys()
 
     @staticmethod
-    def _try_stack(nested_tensors: NestedTensors) -> NestedTensors:
+    def _try_stack(nested_tensors: NestedTensors,
+                   pin_memory: bool = False) -> NestedTensors:
         """
         Stack the inner dimensions that have the same shape in
         a nested list of tensors.
@@ -696,7 +698,9 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
         if isinstance(nested_tensors, (int, float)):
             return torch.tensor(nested_tensors)
 
-        stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors]
+        stacked = [
+            MultiModalKwargs._try_stack(t, pin_memory) for t in nested_tensors
+        ]
         if not is_list_of(stacked, torch.Tensor, check="all"):
             # Only tensors (not lists) can be stacked.
             return stacked
@@ -712,10 +716,16 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
             # The tensors have incompatible shapes and can't be stacked.
             return tensors_
 
-        return torch.stack(tensors_)
+        outputs = torch.empty(len(tensors_),
+                              *tensors_[0].shape,
+                              dtype=tensors_[0].dtype,
+                              device=tensors_[0].device,
+                              pin_memory=pin_memory)
+        return torch.stack(tensors_, out=outputs)
 
     @staticmethod
-    def batch(inputs_list: list["MultiModalKwargs"]) -> BatchedTensorInputs:
+    def batch(inputs_list: list["MultiModalKwargs"],
+              pin_memory: bool = False) -> BatchedTensorInputs:
         """
         Batch multiple inputs together into a dictionary.
 
@@ -737,7 +747,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
                 item_lists[k].append(v)
 
         return {
-            k: MultiModalKwargs._try_stack(item_list)
+            k: MultiModalKwargs._try_stack(item_list, pin_memory)
             for k, item_list in item_lists.items()
         }
 
@@ -746,17 +756,11 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
         batched_inputs: BatchedTensorInputs,
         *,
         device: torch.types.Device,
-        dtype: Optional[torch.dtype] = None,
     ) -> BatchedTensorInputs:
         json_inputs = cast(JSONTree[torch.Tensor], batched_inputs)
 
-        def maybe_cast_dtype(x: torch.Tensor):
-            # This mimics the behavior of transformers.BatchFeature
-            return x.to(dtype=dtype) if x.is_floating_point() else x
-
         json_mapped = json_map_leaves(
-            # NOTE: Cast the dtype before sending it to device
-            lambda x: maybe_cast_dtype(x).to(device=device, non_blocking=True),
+            lambda x: x.to(device=device, non_blocking=True),
             json_inputs,
         )
 
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 63af842747a54e42b322d904cedc5913fcb1bbce..cae62b2235e4011e98a64453151269db5cd13b1e 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from collections import UserDict
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index aa7914e40cbffaf1752c5087682bbddd998820f4..5cfca57bffeec970d390394f5eab7ca54a9d8fec 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import sys
 from abc import ABC, abstractmethod
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index a85b13fb2387b17419c396b93d9f244281d7e7b4..1faecb7bd24a8588b84b3de4d2d89eb92d889b30 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
-
-from abc import ABC
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from dataclasses import dataclass, field
 from typing import Generic, NamedTuple, Optional, TypeVar, Union, cast
@@ -60,29 +60,14 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
 
         self.info = info
 
-    # TODO: @abstractmethod after transition
+    @abstractmethod
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         """
         Build the text input corresponding to `mm_counts`.
         """
-        if (type(self).get_dummy_processor_inputs ==
-                BaseDummyInputsBuilder.get_dummy_processor_inputs):
-            raise NotImplementedError
-
-        logger.warning_once("`get_dummy_processor_inputs` has been split up "
-                            "into `get_dummy_text` and `get_dummy_mm_data`. "
-                            "These two methods will be marked as abstract "
-                            "in an upcoming release.")
-
-        seq_len = self.info.ctx.model_config.max_model_len
-
-        prompt = self.get_dummy_processor_inputs(seq_len, mm_counts).prompt
-        if not isinstance(prompt, str):
-            prompt = self.info.get_tokenizer().decode(prompt)
-
-        return prompt
+        raise NotImplementedError
 
-    # TODO: @abstractmethod after transition
+    @abstractmethod
     def get_dummy_mm_data(
         self,
         seq_len: int,
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index b9f5cee922a70d1ab68cd06a7e53baeb4498ed04..27aaa661c35c86296bfa98b5d9260a5d56394bbe 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Mapping
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 9ddba67bff70280249b5d425cbb3450c19a2b48d..11a25f851546237b8a13d64317dd517df2111223 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import groupby
 from pathlib import Path
@@ -8,10 +9,13 @@ from urllib.parse import ParseResult, urlparse
 import numpy as np
 import numpy.typing as npt
 import torch
-from PIL import Image
+from PIL import Image, UnidentifiedImageError
 
 import vllm.envs as envs
 from vllm.connections import HTTPConnection, global_http_connection
+from vllm.distributed import (get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_gather)
 
 from .audio import AudioMediaIO
 from .base import MediaIO
@@ -181,11 +185,15 @@ class MediaConnector:
         """
         image_io = ImageMediaIO(image_mode=image_mode)
 
-        return self.load_from_url(
-            image_url,
-            image_io,
-            fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
-        )
+        try:
+            return self.load_from_url(
+                image_url,
+                image_io,
+                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
+            )
+        except UnidentifiedImageError as e:
+            # convert to ValueError to be properly caught upstream
+            raise ValueError(str(e)) from e
 
     async def fetch_image_async(
         self,
@@ -200,11 +208,15 @@ class MediaConnector:
         """
         image_io = ImageMediaIO(image_mode=image_mode)
 
-        return await self.load_from_url_async(
-            image_url,
-            image_io,
-            fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
-        )
+        try:
+            return await self.load_from_url_async(
+                image_url,
+                image_io,
+                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
+            )
+        except UnidentifiedImageError as e:
+            # convert to ValueError to be properly caught upstream
+            raise ValueError(str(e)) from e
 
     def fetch_video(
         self,
@@ -390,3 +402,35 @@ def group_mm_inputs_by_modality(
     return [
         list(group) for _, group in groupby(mm_inputs, key=modality_group_func)
     ]
+
+
+def run_dp_sharded_vision_model(image_input: torch.Tensor,
+                                vision_model: torch.nn.Module) -> torch.Tensor:
+    """Run a vision model with data parallelism (DP) sharding. The function 
+    will shard the input image tensor on the first dimension and run the vision
+    model
+
+    Args:
+        image_input (torch.Tensor): Image input tensor.
+        vision_model (torch.nn.Module): Vision model.
+
+    Returns:
+        torch.Tensor: Output image embeddings
+    """
+
+    num_chunks = image_input.shape[0]
+    mp_world_size = get_tensor_model_parallel_world_size()
+    num_chunks_per_rank = (num_chunks + mp_world_size - 1) // mp_world_size
+    num_padded_chunks = num_chunks_per_rank * mp_world_size - num_chunks
+    pad = (0, ) * (2 * (image_input.dim() - 1)) + (0, num_padded_chunks)
+    image_input_padded = torch.nn.functional.pad(image_input, pad)
+    rank = get_tensor_model_parallel_rank()
+    image_input_per_rank = image_input_padded[rank *
+                                              num_chunks_per_rank:(rank + 1) *
+                                              num_chunks_per_rank, ...]
+
+    vision_embeddings = vision_model(image_input_per_rank)
+    vision_embeddings = tensor_model_parallel_all_gather(vision_embeddings,
+                                                         dim=0)
+    vision_embeddings = vision_embeddings[:num_chunks, ...]
+    return vision_embeddings
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 261d56abad9c60eeb2fb9be9b30653e5acae5cfd..bedb9536e3c9c5b6dc95431ba6b39747ed37a20d 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
 from abc import abstractmethod
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 33cc50c872b67b67da65c2f8371587c3e48453b1..891305eb7936e09d54e8fbbb1a4d5dd0b9e45fdd 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from collections.abc import MutableSequence
@@ -7,7 +8,7 @@ from dataclasses import dataclass
 from typing import Any, Generic, Optional, Union
 
 import torch
-from typing_extensions import TypeVar, deprecated
+from typing_extensions import TypeVar
 
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -76,14 +77,6 @@ class PoolingOutput:
         return (isinstance(other, self.__class__) and bool(
             (self.data == other.data).all()))
 
-    @property
-    @deprecated("`LLM.encode()` now stores raw outputs in the `data` "
-                "attribute. To return embeddings, use `LLM.embed()`. "
-                "To return class probabilities, use `LLM.classify()` "
-                "and access the `probs` attribute. ")
-    def embedding(self) -> list[float]:
-        return self.data.tolist()
-
 
 class RequestOutput:
     """The output data of a completion request to the LLM.
@@ -506,12 +499,6 @@ class ScoringOutput:
     def __repr__(self) -> str:
         return f"ScoringOutput(score={self.score})"
 
-    @property
-    @deprecated("`LLM.score()` now returns scalar scores. "
-                "Please access it via the `score` attribute. ")
-    def embedding(self) -> list[float]:
-        return [self.score]
-
 
 class ScoringRequestOutput(PoolingRequestOutput[ScoringOutput]):
 
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 00d00d05f47ae6159be5a3fbe4fdd5cb5742dc2f..13453d2c4b4b281616197f9a5c8a75fc082dc34c 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
 import traceback
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index c79c603c02ebccbe7e40ec1518a4dfc15e46076e..27c591e3babd467b7d576f1491a426d62053be8b 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
+import platform
 import sys
 from importlib.util import find_spec
 from typing import TYPE_CHECKING, Optional
@@ -21,6 +23,15 @@ else:
     VllmConfig = None
 
 
+def get_max_threads(pid=0):
+    if hasattr(os, 'sched_getaffinity'):
+        return len(os.sched_getaffinity(pid))
+    elif platform.system() == 'Darwin':
+        return os.cpu_count()
+    else:
+        raise NotImplementedError("Unsupported OS")
+
+
 class CpuPlatform(Platform):
     _enum = PlatformEnum.CPU
     device_name: str = "cpu"
@@ -28,7 +39,7 @@ class CpuPlatform(Platform):
     dispatch_key: str = "CPU"
 
     @property
-    def supported_dtypes(self) -> list:
+    def supported_dtypes(self) -> list[torch.dtype]:
         if self.get_cpu_architecture() == CpuArchEnum.POWERPC:
             return [torch.bfloat16, torch.float32]
         elif sys.platform.startswith(
@@ -56,7 +67,10 @@ class CpuPlatform(Platform):
             logger.info("Using CPU MLA backend.")
             return "vllm.attention.backends.cpu_mla.CPUMLABackend"
         logger.info("Using Torch SDPA backend.")
-        return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
+        if use_v1:
+            return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend"
+        else:
+            return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
 
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
@@ -80,6 +94,8 @@ class CpuPlatform(Platform):
         if not model_config.enforce_eager:
             model_config.enforce_eager = True
 
+        model_config.disable_cascade_attn = True
+
         cache_config = vllm_config.cache_config
 
         ipex_available = find_spec("intel_extension_for_pytorch") is not None
@@ -127,7 +143,8 @@ class CpuPlatform(Platform):
                 f" {kv_cache_space}, expect a positive integer value.")
 
         parallel_config = vllm_config.parallel_config
-        if (parallel_config.distributed_executor_backend is not None
+        if (parallel_config.world_size > 1
+                and parallel_config.distributed_executor_backend is not None
                 and parallel_config.distributed_executor_backend != "mp"):
             logger.warning(("%s is not supported on CPU, fallback to mp "
                             "distributed executor backend."),
@@ -140,7 +157,38 @@ class CpuPlatform(Platform):
                 parallel_config.sd_worker_cls = \
                     "vllm.worker.cpu_worker.CPUWorker"
             else:
-                parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
+                if envs.VLLM_USE_V1:
+                    parallel_config.worker_cls = \
+                        "vllm.v1.worker.cpu_worker.CPUWorker"
+                else:
+                    parallel_config.worker_cls = \
+                        "vllm.worker.cpu_worker.CPUWorker"
+
+        # Note: workaround for v1 gpu_model_runner
+        from vllm.config import CompilationLevel
+        vllm_config.compilation_config.cudagraph_capture_sizes = []
+
+        compilation_config = vllm_config.compilation_config
+        if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level
+                == CompilationLevel.PIECEWISE):
+            compilation_config.level = CompilationLevel.DYNAMO_ONCE
+            compilation_config.backend = "eager"
+            compilation_config.custom_ops += ["none"]
+            compilation_config.inductor_compile_config.update({
+                "dce":
+                True,
+                "size_asserts":
+                False,
+                "nan_asserts":
+                False,
+                "memory_planning":
+                True,
+                "epilogue_fusion":
+                True,
+            })
+
+        if vllm_config.lora_config is not None:
+            compilation_config.level = CompilationLevel.NO_COMPILATION
 
         assert vllm_config.device_config.device_type == "cpu"
 
@@ -148,12 +196,21 @@ class CpuPlatform(Platform):
         # Environment variables for CPU executor
         #
 
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+        # Note: to avoid the error 'nthreads cannot be larger than environment
+        #  variable "NUMEXPR_MAX_THREADS" (64)'.
+        os.environ["NUMEXPR_MAX_THREADS"] = str(get_max_threads())
+
         # Set default threads num for OpenMP parallel
         os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
 
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
+        # Share the cpusets list among ranks by spawning process instead
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
         # Intel OpenMP setting
         ld_prealod_str = os.getenv("LD_PRELOAD", "")
         if "libiomp5.so" in ld_prealod_str:
@@ -170,13 +227,6 @@ class CpuPlatform(Platform):
         # To hint IPEX uses shared memory based AllReduce
         os.environ["LOCAL_WORLD_SIZE"] = str(
             vllm_config.parallel_config.tensor_parallel_size)
-        if sys.platform == "darwin" and \
-                envs.VLLM_WORKER_MULTIPROC_METHOD == "fork":
-            if os.environ.get('VLLM_WORKER_MULTIPROC_METHOD', None) is None:
-                logger.warning(
-                    "Default to spawn method on MacOS. If this is not desired,"
-                    " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.")
-                os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
 
         if vllm_config.model_config and vllm_config.model_config.use_mla:
             logger.info(
@@ -203,3 +253,14 @@ class CpuPlatform(Platform):
         Get device specific communicator class for distributed communication.
         """
         return "vllm.distributed.device_communicators.cpu_communicator.CpuCommunicator"  # noqa
+
+    @classmethod
+    def supports_structured_output(cls) -> bool:
+        return True
+
+    @classmethod
+    def supports_v1(cls, model_config) -> bool:
+        """Returns whether the current platform can support v1 for the supplied
+        model configuration.
+        """
+        return True
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 8bb3dfe7457adeaf966210b30c5036420bb65130..48d1aacba1858a54a6f612b87035e8549c4996ff 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -1,13 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Code inside this file can safely assume cuda platform, e.g. importing
 pynvml. However, it should not initialize cuda context.
 """
 
 import os
+from datetime import timedelta
 from functools import wraps
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
 
 import torch
+from torch.distributed import PrefixStore, ProcessGroup
+from torch.distributed.distributed_c10d import is_nccl_available
 from typing_extensions import ParamSpec
 
 # import custom ops, trigger op registration
@@ -103,7 +107,6 @@ class CudaPlatformBase(Platform):
     def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         parallel_config = vllm_config.parallel_config
         scheduler_config = vllm_config.scheduler_config
-        compilation_config = vllm_config.compilation_config
         model_config = vllm_config.model_config
 
         if parallel_config.worker_cls == "auto":
@@ -151,20 +154,26 @@ class CudaPlatformBase(Platform):
                 logger.info(
                     "Forcing kv cache block size to 64 for FlashMLA backend.")
 
-        if (parallel_config.data_parallel_size > 1
-                and compilation_config.use_cudagraph):
+        if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
+                and parallel_config.data_parallel_size > 1
+                and vllm_config.compilation_config.use_cudagraph):
             logger.info(
-                "Data Parallel: Forcing enforce eager to be True since DP is "
-                "currently not supported with CUDA Graphs.")
+                "Data Parallel: Forcing enforce eager to be True since DP "
+                "with DeepEP high-throughput kernels are not CUDA Graph "
+                "compatible. The DeepEP low-latency kernels are CUDA Graph "
+                "compatible. Set the all_to_all backend to deepep_low_latency "
+                "to use those kernels instead.")
+            vllm_config.compilation_config.use_cudagraph = False
             vllm_config.model_config.enforce_eager = True
-            compilation_config.use_cudagraph = False
-            # FIXME: inductor breaks cudagraph (from @bnell)
-            compilation_config.use_inductor = False
+            # TODO (varun): Turning this ON gives incorrect results for the
+            # Deepseek-V2-lite model.
+            vllm_config.compilation_config.use_inductor = False
 
     @classmethod
     def get_current_memory_usage(cls,
                                  device: Optional[torch.types.Device] = None
                                  ) -> float:
+        torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats(device)
         return torch.cuda.max_memory_allocated(device)
 
@@ -175,6 +184,14 @@ class CudaPlatformBase(Platform):
         if use_mla:
             # TODO(lucas): refactor to  be more concise
             #  we should probably consider factoring out V1 here
+            if selected_backend == _Backend.CUTLASS_MLA_VLLM_V1:
+                if use_v1:
+                    logger.info_once("Using Cutlass MLA backend on V1 engine.")
+                    return ("vllm.v1.attention.backends.mla."
+                            "cutlass_mla.CutlassMLABackend")
+                else:
+                    logger.warning(
+                        "Cutlass MLA backend is only supported on V1 engine")
             if selected_backend == _Backend.TRITON_MLA or block_size != 64:
                 if use_v1:
                     logger.info_once("Using Triton MLA backend on V1 engine.")
@@ -209,10 +226,28 @@ class CudaPlatformBase(Platform):
             if selected_backend == _Backend.FLASHINFER:
                 logger.info_once("Using FlashInfer backend on V1 engine.")
                 return "vllm.v1.attention.backends.flashinfer.FlashInferBackend"
+            if selected_backend == _Backend.FLEX_ATTENTION:
+                logger.info("Using FlexAttenion backend on V1 engine.")
+                return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"  # noqa: E501
             if selected_backend == _Backend.TRITON_ATTN_VLLM_V1:
                 logger.info_once("Using Triton backend on V1 engine.")
                 return ("vllm.v1.attention.backends."
                         "triton_attn.TritonAttentionBackend")
+            if cls.is_device_capability(100):
+                # Prefer FlashInfer for V1 on Blackwell GPUs if installed
+                try:
+                    import flashinfer  # noqa: F401
+                    logger.info_once(
+                        "Using FlashInfer backend on V1 engine by default for "
+                        "Blackwell (SM 10.0) GPUs.")
+                    return ("vllm.v1.attention.backends."
+                            "flashinfer.FlashInferBackend")
+                except ImportError:
+                    logger.info_once(
+                        "FlashInfer failed to import for V1 engine on "
+                        "Blackwell (SM 10.0) GPUs; it is recommended to "
+                        "install FlashInfer for better performance.")
+                    pass
             if cls.has_device_capability(80):
                 logger.info_once("Using Flash Attention backend on V1 engine.")
                 return ("vllm.v1.attention.backends."
@@ -316,6 +351,36 @@ class CudaPlatformBase(Platform):
     def get_piecewise_backend_cls(cls) -> str:
         return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa
 
+    @classmethod
+    def stateless_init_device_torch_dist_pg(
+        cls,
+        backend: str,
+        prefix_store: PrefixStore,
+        group_rank: int,
+        group_size: int,
+        timeout: timedelta,
+    ) -> ProcessGroup:
+        assert is_nccl_available()
+        pg: ProcessGroup = ProcessGroup(
+            prefix_store,
+            group_rank,
+            group_size,
+        )
+        from torch.distributed.distributed_c10d import ProcessGroupNCCL
+
+        backend_options = ProcessGroupNCCL.Options()
+        backend_options._timeout = timeout
+
+        backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
+                                         backend_options)
+        backend_type = ProcessGroup.BackendType.NCCL
+        device = torch.device("cuda")
+        pg._set_default_backend(backend_type)
+        backend_class._set_sequence_number_for_group()
+
+        pg._register_backend(device, backend_type, backend_class)
+        return pg
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index a8dd7df9f2e3ebaf9c87e32bfc107c1bca523bcd..3cf28950190c8dd9612b339ee86dd20ebd183ffe 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import TYPE_CHECKING, Optional
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 504c3b42a75d37b1396cb3c01b4efbcc5690fc1a..f91f222b25e58060dde784b1ecbb97c22582df7c 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -1,13 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import enum
 import os
 import platform
 import random
+from datetime import timedelta
 from platform import uname
 from typing import TYPE_CHECKING, NamedTuple, Optional, Union
 
 import numpy as np
 import torch
+from torch.distributed import PrefixStore, ProcessGroup
 
 from vllm.inputs import ProcessorInputs, PromptType
 from vllm.logger import init_logger
@@ -44,8 +47,12 @@ class _Backend(enum.Enum):
     ROCM_AITER_MLA_VLLM_V1 = enum.auto()
     TORCH_SDPA = enum.auto()
     FLASHINFER = enum.auto()
+    FLASHINFER_VLLM_V1 = enum.auto()
     TRITON_MLA = enum.auto()  # Supported by V1
+    TRITON_MLA_VLLM_V1 = enum.auto()
+    FLASHMLA_VLLM_V1 = enum.auto()
     FLASHMLA = enum.auto()  # Supported by V1
+    CUTLASS_MLA_VLLM_V1 = enum.auto()
     HPU_ATTN = enum.auto()
     PALLAS = enum.auto()
     PALLAS_VLLM_V1 = enum.auto()
@@ -53,6 +60,7 @@ class _Backend(enum.Enum):
     BLOCK_SPARSE_FLASH_ATTN = enum.auto()
     DUAL_CHUNK_FLASH_ATTN = enum.auto()
     NO_ATTENTION = enum.auto()
+    FLEX_ATTENTION = enum.auto()
 
 
 class PlatformEnum(enum.Enum):
@@ -221,6 +229,30 @@ class Platform:
 
         return current_capability.to_int() >= capability
 
+    @classmethod
+    def is_device_capability(
+        cls,
+        capability: Union[tuple[int, int], int],
+        device_id: int = 0,
+    ) -> bool:
+        """
+        Test whether this platform has exactly the specified device capability.
+
+        The `capability` argument can either be:
+
+        - A tuple `(major, minor)`.
+        - An integer `<major><minor>`. (See
+        [`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int])
+        """
+        current_capability = cls.get_device_capability(device_id=device_id)
+        if current_capability is None:
+            return False
+
+        if isinstance(capability, tuple):
+            return current_capability == capability
+
+        return current_capability.to_int() == capability
+
     @classmethod
     def get_device_name(cls, device_id: int = 0) -> str:
         """Get the name of a device."""
@@ -486,6 +518,20 @@ class Platform:
         """
         return "vllm.compilation.base_piecewise_backend.AbstractPiecewiseBackend"  # noqa
 
+    @classmethod
+    def stateless_init_device_torch_dist_pg(
+        cls,
+        backend: str,
+        prefix_store: PrefixStore,
+        group_rank: int,
+        group_size: int,
+        timeout: timedelta,
+    ) -> ProcessGroup:
+        """
+        Init platform-specific torch distributed process group.
+        """
+        raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 9cd49fd34804983e2afe3d7ac936f2cb031d1fb7..04e918d7aebee74f6b4cee031aad86d373a1f69f 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import enum
 import os
 from functools import lru_cache
@@ -28,7 +29,7 @@ class NeuronPlatform(Platform):
     device_name: str = "neuron"
     device_type: str = "neuron"
     ray_device_key: str = "neuron_cores"
-    supported_quantization: list[str] = ["neuron_quant"]
+    supported_quantization: list[str] = ["neuron_quant", "fbgemm_fp8"]
     device_control_env_var: str = "NEURON_RT_VISIBLE_CORES"
 
     @classmethod
@@ -49,9 +50,6 @@ class NeuronPlatform(Platform):
         if parallel_config.world_size > 1:
             parallel_config.distributed_executor_backend = "uni"
 
-        assert (vllm_config.lora_config
-                is None), "LoRA is not supported for Neuron backend."
-
         if vllm_config.cache_config and vllm_config.model_config:
             # neuron needs block_size = max_model_len
             vllm_config.cache_config.block_size = \
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index dc0672ac70829f11076604785b4afbe323894809..8e555b5f9e7601293f84fa11529962ed6586cb89 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -1,10 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
+from datetime import timedelta
 from functools import cache, lru_cache, wraps
 from typing import TYPE_CHECKING, Optional
 
 import torch
+from torch.distributed import PrefixStore, ProcessGroup
+from torch.distributed.distributed_c10d import is_nccl_available
 
 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -96,9 +100,21 @@ def with_amdsmi_context(fn):
 
 
 @cache
-def on_mi250_mi300() -> bool:
+def on_gfx1x() -> bool:
     GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
-    return any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942"])
+    return any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"])
+
+
+@cache
+def on_mi3xx() -> bool:
+    GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
+    return any(arch in GPU_ARCH for arch in ["gfx942", "gfx950"])
+
+
+@cache
+def on_gfx9() -> bool:
+    GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
+    return any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"])
 
 
 @cache
@@ -194,8 +210,9 @@ class RocmPlatform(Platform):
                     f" The selected backend, {selected_backend.name},"
                     f"is not MLA type while requested for MLA backend.")
 
-        selected_backend = (_Backend.ROCM_FLASH if selected_backend
-                            == _Backend.FLASH_ATTN else selected_backend)
+        if selected_backend is None or selected_backend == _Backend.FLASH_ATTN:
+            selected_backend = _Backend.ROCM_FLASH
+
         if envs.VLLM_USE_V1:
             logger.info("Using Triton Attention backend on V1 engine.")
             return ("vllm.v1.attention.backends."
@@ -387,3 +404,33 @@ class RocmPlatform(Platform):
     @classmethod
     def get_piecewise_backend_cls(cls) -> str:
         return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa
+
+    @classmethod
+    def stateless_init_device_torch_dist_pg(
+        cls,
+        backend: str,
+        prefix_store: PrefixStore,
+        group_rank: int,
+        group_size: int,
+        timeout: timedelta,
+    ) -> ProcessGroup:
+        assert is_nccl_available()
+        pg: ProcessGroup = ProcessGroup(
+            prefix_store,
+            group_rank,
+            group_size,
+        )
+        from torch.distributed.distributed_c10d import ProcessGroupNCCL
+
+        backend_options = ProcessGroupNCCL.Options()
+        backend_options._timeout = timeout
+
+        backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
+                                         backend_options)
+        backend_type = ProcessGroup.BackendType.NCCL
+        device = torch.device("cuda")
+        pg._set_default_backend(backend_type)
+        backend_class._set_sequence_number_for_group()
+
+        pg._register_backend(device, backend_type, backend_class)
+        return pg
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 0173b15697cfe6ed18395d9e982eb67da61e78b5..07e52017f5a535632a1a56d882c6e77c3b6f7017 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING, Optional, Union, cast
 
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index b2a6ad5d77db60b9e4a5db1ede92fd27fd150681..73f6f3d417671077d5d2817d606c7e2aadac965f 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING, Optional
 
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index 2884cb46fecd73f2329f42fc1eb2ed0403d3c1e6..2cb177b9ba789d65c2ef5b613266c5912e2f6024 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
 import os
@@ -10,6 +11,8 @@ import vllm.envs as envs
 
 logger = logging.getLogger(__name__)
 
+DEFAULT_PLUGINS_GROUP = 'vllm.general_plugins'
+
 # make sure one process only loads plugins once
 plugins_loaded = False
 
@@ -28,19 +31,24 @@ def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]:
         logger.debug("No plugins for group %s found.", group)
         return {}
 
-    logger.info("Available plugins for group %s:", group)
+    # Check if the only discovered plugin is the default one
+    is_default_group = (group == DEFAULT_PLUGINS_GROUP)
+    # Use INFO for non-default groups and DEBUG for the default group
+    log_level = logger.debug if is_default_group else logger.info
+
+    log_level("Available plugins for group %s:", group)
     for plugin in discovered_plugins:
-        logger.info("- %s -> %s", plugin.name, plugin.value)
+        log_level("- %s -> %s", plugin.name, plugin.value)
 
     if allowed_plugins is None:
-        logger.info("All plugins in this group will be loaded. "
-                    "Set `VLLM_PLUGINS` to control which plugins to load.")
+        log_level("All plugins in this group will be loaded. "
+                  "Set `VLLM_PLUGINS` to control which plugins to load.")
 
     plugins = dict[str, Callable[[], Any]]()
     for plugin in discovered_plugins:
         if allowed_plugins is None or plugin.name in allowed_plugins:
             if allowed_plugins is not None:
-                logger.info("Loading plugin %s", plugin.name)
+                log_level("Loading plugin %s", plugin.name)
 
             try:
                 func = plugin.load()
@@ -80,7 +88,7 @@ def load_general_plugins():
             # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
             os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
 
-    plugins = load_plugins_by_group(group='vllm.general_plugins')
+    plugins = load_plugins_by_group(group=DEFAULT_PLUGINS_GROUP)
     # general plugins, we only need to execute the loaded functions
     for func in plugins.values():
         func()
diff --git a/vllm/plugins/lora_resolvers/filesystem_resolver.py b/vllm/plugins/lora_resolvers/filesystem_resolver.py
index 219231f777852a344cf71235caee56ac17efd9d5..b999d07a6eb74f05b11ae12d88c03e238fbd6437 100644
--- a/vllm/plugins/lora_resolvers/filesystem_resolver.py
+++ b/vllm/plugins/lora_resolvers/filesystem_resolver.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import os
 from typing import Optional
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index 9a3b254f9b68c3e603c0f09da54ae5249ff732a2..322f9ed3efa9fb1f62dd832b58cfe76109669c95 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING, Any, Optional
 
diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py
index 6934d328a87ef2494e04b038d2316e2ddca845b4..2f9ebe531cbb1da612a6ce7d18498b96468a98a3 100644
--- a/vllm/profiler/layerwise_profile.py
+++ b/vllm/profiler/layerwise_profile.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 from collections import defaultdict
diff --git a/vllm/profiler/utils.py b/vllm/profiler/utils.py
index b26fd4dd8c071ef2d6857aca447f2f70f7748dad..9f0f56a15fd5362255599fa053aeb35a18aa13ca 100644
--- a/vllm/profiler/utils.py
+++ b/vllm/profiler/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from typing import Callable, Union
diff --git a/vllm/prompt_adapter/layers.py b/vllm/prompt_adapter/layers.py
index c2f9f16919b7fc7bbb89899b615dc06e415ae6bd..b5b925d042f23a1dd791d713eaf2bcc16e4ecb46 100644
--- a/vllm/prompt_adapter/layers.py
+++ b/vllm/prompt_adapter/layers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Optional
diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py
index 795591606f25941ee72482b5e0b1b27304974d79..864b50c861e197715bdbbdfbce826199df02d807 100644
--- a/vllm/prompt_adapter/models.py
+++ b/vllm/prompt_adapter/models.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
 import math
diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py
index dfb8e61d786a0f81bf617f33a3a06957cdab02ac..3ce50d0a26bb04d0b3f58c415fc108d8f35fb076 100644
--- a/vllm/prompt_adapter/request.py
+++ b/vllm/prompt_adapter/request.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import msgspec
 
diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py
index dd179ab938f8346f2d48881532fa4e7d5b59798a..ddd007868f6bfd9a38cb730f8a885d329fd84745 100644
--- a/vllm/prompt_adapter/utils.py
+++ b/vllm/prompt_adapter/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420
 
diff --git a/vllm/prompt_adapter/worker_manager.py b/vllm/prompt_adapter/worker_manager.py
index 28dcc16871120e03646df779e7bddd2b824d64ff..56265de8087c0062adc87f419838c0f727774d5b 100644
--- a/vllm/prompt_adapter/worker_manager.py
+++ b/vllm/prompt_adapter/worker_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
 from typing import Any, Optional, Set, Type
diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py
index 65606ce55af72eeca6160eb97fdd5b09ce3a0267..e8cd565519f36dca1e7437a87ef2ad793fcf7b9c 100644
--- a/vllm/reasoning/__init__.py
+++ b/vllm/reasoning/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
 from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index 9dd5191da91847a1dd0541d0772f81c0cb00d03d..e827d381ca1d2a22044edebdc50ab541437eefd7 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py
index 1c283c092a28c145e99c70c11d65ae69fc0d1df0..1a5ca46a60f1d8ec12d5334fc596b605628051d0 100644
--- a/vllm/reasoning/deepseek_r1_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from typing import Optional, Union
diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py
index 07a63e294df49c7870bdc2cbfad4fb79c24dec7b..5820001b918f66e6aaf76d912409f476c3f309bb 100644
--- a/vllm/reasoning/granite_reasoning_parser.py
+++ b/vllm/reasoning/granite_reasoning_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from typing import Optional, Union
diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py
index 7095034b1ca173559aefe10089058561290ba517..61bafc724c17f8bd931e7f8db2cb90fc1f16324c 100644
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
 from typing import Optional, Union
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index dc38daa388cedbc4da6a623ce000f9812cb3ee89..7abdcecca4746ce656dc59eb19d9a0e48bdbef70 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Sampling parameters for text generation."""
 import copy
 from dataclasses import dataclass
@@ -389,6 +390,17 @@ class SamplingParams(
                              f"type {type(self.n)}")
         if self.n < 1:
             raise ValueError(f"n must be at least 1, got {self.n}.")
+        if self.best_of is not None:
+            if not isinstance(self.best_of, int):
+                raise ValueError(
+                    f"best_of must be an integer, got {type(self.best_of)}")
+            if self.best_of < 1:
+                raise ValueError(
+                    f"best_of must be at least 1, got {self.best_of}")
+            if self.best_of < self.n:
+                raise ValueError(
+                    f"best_of must be greater than or equal to n, "
+                    f"got n={self.n} and best_of={self.best_of}.")
         if not -2.0 <= self.presence_penalty <= 2.0:
             raise ValueError("presence_penalty must be in [-2, 2], got "
                              f"{self.presence_penalty}.")
diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py
index fc1761c84cd11fc38ab25fea2f460d95d51fb471..9060b55c79b0185c5dd22a2940e4363f69ccabeb 100644
--- a/vllm/scalar_type.py
+++ b/vllm/scalar_type.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import functools
 import struct
diff --git a/vllm/scripts.py b/vllm/scripts.py
index 7e569d2d24fd620617be4db785d1c319d562d5c9..7a7fdccf0a32bf06baa24cde1e5b8d70d8b962a9 100644
--- a/vllm/scripts.py
+++ b/vllm/scripts.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.entrypoints.cli.main import main as vllm_main
 from vllm.logger import init_logger
diff --git a/vllm/sequence.py b/vllm/sequence.py
index d359f897da25e33352f7fcf299cc75bce732734f..ffe890eb2dab4b9f8c9aeb5a1f5ef1d6919ca03d 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Sequence and its related classes."""
 import copy
 import enum
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
index e08ed742a5225186880dc60dc86017bcdc334bd7..f9b882469a4df6924ce95edf42d0a2a8c82ffc7b 100644
--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from array import array
 from itertools import chain, count
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index 991d2040a878a41d35b41e16e3f8457c1d0a3265..96646ec94718659c6838e936a9771577425677b5 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List, Optional
 
@@ -296,7 +297,6 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
                     intermediate_tensors=intermediate_tensors,
                     **MultiModalKwargs.as_kwargs(
                         multi_modal_kwargs,
-                        dtype=self.model_runner.model_config.dtype,
                         device=self.device,
                     ),
                     **model_execute_kwargs,
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index dd085ad77638462535cf2c3d9def11e8647de965..70ec1590e7ad0342158872fc10143553dd0af712 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py
index 0b62a988e8b267aed2ecad09cba59eec38808b74..82b5a79fa7cb901646f99f20ba0ce28c321ae630 100644
--- a/vllm/spec_decode/medusa_worker.py
+++ b/vllm/spec_decode/medusa_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import weakref
 from typing import List, Optional, Set, Tuple
diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py
index 4430da26c0493f1beef505d0555b47abc2b120ad..a4784cad962d06a064b22ad0bc5dcb220914a0ab 100644
--- a/vllm/spec_decode/metrics.py
+++ b/vllm/spec_decode/metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from typing import Callable, Optional, Union
diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py
index bdaf31895e25dee2e8ec47ff6f0a41f2c208f623..8e8c05d26361bd20c25a196a9986fad0386a871b 100644
--- a/vllm/spec_decode/mlp_speculator_worker.py
+++ b/vllm/spec_decode/mlp_speculator_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List, Optional, Set, Tuple
 
diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py
index 6275c460ecefa0aaca2fe2d6be7e3dc90ccd3aa0..18e7b055a67823fef5ff39f547c14c13b119e399 100644
--- a/vllm/spec_decode/mqa_scorer.py
+++ b/vllm/spec_decode/mqa_scorer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.sequence import (ExecuteModelRequest, SequenceData,
                            SequenceGroupMetadata, get_all_seq_ids)
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index de57403d1b50e823234ac9e77f14870cd007db6f..4a9bbe44d89a03b49e40e03978f7b4404f6537c9 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 import weakref
diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py
index 57ae173af6744f7ea7713fc38501c635793a431b..7a1a0e56dc00b7c471a6ef1d1a17c129d9283858 100644
--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import weakref
 from typing import List, Optional, Set, Tuple
diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py
index 2829d631b49ee70e8d69c427a991158bf400bb77..fb44275aa93574cf48d16d57525fa1dc1f99ae22 100644
--- a/vllm/spec_decode/proposer_worker_base.py
+++ b/vllm/spec_decode/proposer_worker_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
 from typing import List, Optional, Set, Tuple
diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py
index ea3d91d7893bb655b41e737f7cc9e7ef3694b938..91256cab6e7993b72169073cec46fb861b2b2af3 100644
--- a/vllm/spec_decode/smaller_tp_proposer_worker.py
+++ b/vllm/spec_decode/smaller_tp_proposer_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List, Optional, Set, Tuple
 
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 252c80957305b0c3f6aec61867465eb50cc1f109..7dda1cbfe230241f69df4613912f97a0bef37b33 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 from collections import defaultdict
diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py
index 08e773c562bf83f7f9fd3928b0aa30881f25e526..ca89eb60ac583f2c4a0be9906474cb20bef97277 100644
--- a/vllm/spec_decode/target_model_runner.py
+++ b/vllm/spec_decode/target_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List, Optional
 
diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py
index b538923c03e74a95e307302bfb9f2d2fe976d838..afd91b42b9433f9abf95201dad8752a95328c65d 100644
--- a/vllm/spec_decode/top1_proposer.py
+++ b/vllm/spec_decode/top1_proposer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import List, Optional, Set, Tuple
 
diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py
index 466269b2107f5b44e284938c756fb00507059052..22d2a4833acf9a884531734526d02f2e27d38c94 100644
--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from contextlib import contextmanager
diff --git a/vllm/test_utils.py b/vllm/test_utils.py
index f8cec380f336eca5c0dbd23e31278187e0fffdbd..c6b126d002b2da59f69a8273a3c33bf00b0bc63d 100644
--- a/vllm/test_utils.py
+++ b/vllm/test_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 MODELS_ON_S3 = [
     "adept/fuyu-8b",
     "ai21labs/AI21-Jamba-1.5-Mini",
diff --git a/vllm/third_party/pynvml.py b/vllm/third_party/pynvml.py
index 7ed9ced0e2620c108a3424b4d8e29f7506bd1fb7..d215e5d8bf657a35c1ee9059b51338e9b1f07220 100644
--- a/vllm/third_party/pynvml.py
+++ b/vllm/third_party/pynvml.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # copied from https://pypi.org/project/nvidia-ml-py
 # version 12.570.86
 
diff --git a/vllm/tracing.py b/vllm/tracing.py
index 557ae40b87aeec3dba7914e0990f251b528d375c..6a287d82be5ffbabcd732a25d340e9a306e75fce 100644
--- a/vllm/tracing.py
+++ b/vllm/tracing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from collections.abc import Mapping
diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py
index 84bd7a747656439dfe5d00a75d4677e17e0fd5af..6d4231baca50b390c22fbd6cd037a65ff9797138 100644
--- a/vllm/transformers_utils/__init__.py
+++ b/vllm/transformers_utils/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import envs
 
diff --git a/vllm/transformers_utils/chat_templates/__init__.py b/vllm/transformers_utils/chat_templates/__init__.py
index fe2bd3ca4125325e9e9ac4d8882a1cd884b17231..2783d12a22147279e95309885ce583605e45fb9a 100644
--- a/vllm/transformers_utils/chat_templates/__init__.py
+++ b/vllm/transformers_utils/chat_templates/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from .registry import get_chat_template_fallback_path
 
 __all__ = ["get_chat_template_fallback_path"]
diff --git a/vllm/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py
index 853fed5d4409d66dff432c2af8f93ea7801797ec..e0ef7f0999d4771351e6061e408c61ffb94c02ee 100644
--- a/vllm/transformers_utils/chat_templates/registry.py
+++ b/vllm/transformers_utils/chat_templates/registry.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 from typing import Callable, Optional, Union
 
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 69e7207cc3500bb30d688d774cce077b890923ef..52a7a903cd8efa1529b844a8cafe8251db17b96e 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -1,15 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 import json
 import os
 import time
-from functools import cache
+from functools import cache, partial
 from pathlib import Path
-from typing import Any, Callable, Literal, Optional, Union
+from typing import Any, Callable, Literal, Optional, TypeVar, Union
 
 import huggingface_hub
-from huggingface_hub import hf_hub_download
+from huggingface_hub import get_safetensors_metadata, hf_hub_download
 from huggingface_hub import list_repo_files as hf_list_repo_files
 from huggingface_hub import try_to_load_from_cache
 from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
@@ -22,6 +23,7 @@ from transformers.models.auto.image_processing_auto import (
     get_image_processor_config)
 from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
+from transformers.models.auto.tokenization_auto import get_tokenizer_config
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
 from vllm import envs
@@ -93,10 +95,15 @@ class ConfigFormat(str, enum.Enum):
     MISTRAL = "mistral"
 
 
-def with_retry(func: Callable[[], Any],
-               log_msg: str,
-               max_retries: int = 2,
-               retry_delay: int = 2):
+_R = TypeVar("_R")
+
+
+def with_retry(
+    func: Callable[[], _R],
+    log_msg: str,
+    max_retries: int = 2,
+    retry_delay: int = 2,
+) -> _R:
     for attempt in range(max_retries):
         try:
             return func()
@@ -109,6 +116,8 @@ def with_retry(func: Callable[[], Any],
             time.sleep(retry_delay)
             retry_delay *= 2
 
+    raise AssertionError("Should not be reached")
+
 
 # @cache doesn't cache exceptions
 @cache
@@ -134,7 +143,9 @@ def list_repo_files(
                     modelscope_list_repo_files)
                 return modelscope_list_repo_files(repo_id,
                                                   revision=revision,
-                                                  token=token)
+                                                  token=os.getenv(
+                                                      "MODELSCOPE_API_TOKEN",
+                                                      None))
             return hf_list_repo_files(repo_id,
                                       revision=revision,
                                       repo_type=repo_type,
@@ -823,13 +834,54 @@ def try_get_generation_config(
 
 
 def get_cross_encoder_activation_function(config: PretrainedConfig):
-    if (hasattr(config, "sbert_ce_default_activation_function")
-            and config.sbert_ce_default_activation_function is not None):
 
+    function_name: Optional[str] = None
+    if hasattr(config, "sentence_transformers") and "activation_fn" in \
+        config.sentence_transformers:
+        function_name = config.sentence_transformers["activation_fn"]
+
+    elif (hasattr(config, "sbert_ce_default_activation_function")
+          and config.sbert_ce_default_activation_function is not None):
         function_name = config.sbert_ce_default_activation_function
+
+    if function_name is not None:
         assert function_name.startswith("torch.nn.modules."), \
             "Loading of activation functions is restricted to " \
             "torch.nn.modules for security reasons"
         return resolve_obj_by_qualname(function_name)()
     else:
         return nn.Sigmoid() if config.num_labels == 1 else nn.Identity()
+
+
+def try_get_safetensors_metadata(
+    model: str,
+    *,
+    revision: Optional[str] = None,
+):
+    get_safetensors_metadata_partial = partial(
+        get_safetensors_metadata,
+        model,
+        revision=revision,
+        token=os.getenv('HF_TOKEN', None),
+    )
+
+    try:
+        return with_retry(get_safetensors_metadata_partial,
+                          "Error retrieving safetensors")
+    except Exception:
+        return None
+
+
+def try_get_tokenizer_config(
+    pretrained_model_name_or_path: Union[str, os.PathLike],
+    trust_remote_code: bool,
+    revision: Optional[str] = None,
+) -> Optional[dict[str, Any]]:
+    try:
+        return get_tokenizer_config(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+        )
+    except Exception:
+        return None
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index ed10c22c84f08ebf4c7d948248f6933dd008eae7..97a1b683a9b838b61310cb1910ff4d1d0c60ab18 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 from vllm.transformers_utils.configs.cohere2 import Cohere2Config
@@ -22,6 +23,7 @@ from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
 from vllm.transformers_utils.configs.moonvit import MoonViTConfig
 from vllm.transformers_utils.configs.mpt import MPTConfig
 from vllm.transformers_utils.configs.nemotron import NemotronConfig
+from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
 from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
 from vllm.transformers_utils.configs.ovis import OvisConfig
 from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig
@@ -49,6 +51,7 @@ __all__ = [
     "MoonViTConfig",
     "KimiVLConfig",
     "NemotronConfig",
+    "NemotronHConfig",
     "NVLM_D_Config",
     "OvisConfig",
     "SkyworkR1VChatConfig",
diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py
index 2261f0a9e9aac6948000a11152bbc1e8dfbb95e4..a789b93b5edff0f079707c515c3a9e2f2c90e42c 100644
--- a/vllm/transformers_utils/configs/arctic.py
+++ b/vllm/transformers_utils/configs/arctic.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # yapf: disable
 # ruff: noqa: E501
diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py
index 43e9503ffe03f5e4eae1675b3cc215c393591df6..7c5de3e948ed7230c85ee467631bed17b26c8f26 100644
--- a/vllm/transformers_utils/configs/chatglm.py
+++ b/vllm/transformers_utils/configs/chatglm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://github.com/THUDM/ChatGLM2-6B
diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py
index 21328d7675b8207e5d854282b955025a03f7c61f..e547a9c281cffbebb5d9b03f39a53fb52a3e1ae2 100644
--- a/vllm/transformers_utils/configs/cohere2.py
+++ b/vllm/transformers_utils/configs/cohere2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # ruff: noqa
 
diff --git a/vllm/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py
index bffa127fecb254a17f349ac8a52257738f5f97cb..7dbda99f85a4ec01206624f8b2c78b6b9cabb59e 100644
--- a/vllm/transformers_utils/configs/dbrx.py
+++ b/vllm/transformers_utils/configs/dbrx.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # yapf: disable
 # ruff: noqa: E501
diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py
index a54486fa41cd1169e4ca99c50e35b72bc2d4dbbe..957d638318410e36fa3684549be09aaf86455a20 100644
--- a/vllm/transformers_utils/configs/deepseek_vl2.py
+++ b/vllm/transformers_utils/configs/deepseek_vl2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
 
diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py
index 377523efefc301ce777abf2d33544ed7d8575957..fb2e8a1df7052e97c00f31f521687038f1da6354 100644
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Optional, Union
@@ -70,7 +71,7 @@ class EAGLEConfig(PretrainedConfig):
 
         if self.model is not None:
             for k, v in self.model.to_dict().items():
-                if not hasattr(self, k):
+                if k not in kwargs:
                     setattr(self, k, v)
 
     @classmethod
diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py
index 25bafbb85d3064b08d0991cf4768bb0346d6b555..7450904a15cafa1b7fb16e90c466eb1c7c376315 100644
--- a/vllm/transformers_utils/configs/exaone.py
+++ b/vllm/transformers_utils/configs/exaone.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copied from
 # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
diff --git a/vllm/transformers_utils/configs/falcon.py b/vllm/transformers_utils/configs/falcon.py
index f161a06f34238204ec034a21a88dbf92c047082a..2f5400463d91adf86cd9767a5fba17c825fee8ac 100644
--- a/vllm/transformers_utils/configs/falcon.py
+++ b/vllm/transformers_utils/configs/falcon.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
diff --git a/vllm/transformers_utils/configs/h2ovl.py b/vllm/transformers_utils/configs/h2ovl.py
index 48b5d79ff950ba6f4332bf0cf0c6b4a194a4fca6..b36a6dd59d3d3cb1101e6105e0fc5c610180285a 100644
--- a/vllm/transformers_utils/configs/h2ovl.py
+++ b/vllm/transformers_utils/configs/h2ovl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py
diff --git a/vllm/transformers_utils/configs/internvl.py b/vllm/transformers_utils/configs/internvl.py
index 8ea62546e21336107ac7bb509b857978f700f13e..4494ebfef667f06a36e5e5f3d35feba9d6c5953b 100644
--- a/vllm/transformers_utils/configs/internvl.py
+++ b/vllm/transformers_utils/configs/internvl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py
diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py
index b947c6a9e2b4b9b1da18d7109b8ae4e9d04e4aab..767c4ddae870db75a51d58018885857f71a0dd0f 100644
--- a/vllm/transformers_utils/configs/jais.py
+++ b/vllm/transformers_utils/configs/jais.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
diff --git a/vllm/transformers_utils/configs/kimi_vl.py b/vllm/transformers_utils/configs/kimi_vl.py
index 97ff44bb9c1c99f60fa77059936600091958261a..ae8dac0f381d658879c82b4fcaad3f97ca46a049 100644
--- a/vllm/transformers_utils/configs/kimi_vl.py
+++ b/vllm/transformers_utils/configs/kimi_vl.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
 from typing import Optional, Union
 
diff --git a/vllm/transformers_utils/configs/medusa.py b/vllm/transformers_utils/configs/medusa.py
index 885713c5d6cd06cd3d4a411b370417f94c41cced..9ba52956a8e8efc80a59c90f7f66a07234da690b 100644
--- a/vllm/transformers_utils/configs/medusa.py
+++ b/vllm/transformers_utils/configs/medusa.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import Optional, Union
diff --git a/vllm/transformers_utils/configs/minimax_text_01.py b/vllm/transformers_utils/configs/minimax_text_01.py
index 660e870ac62d876b2978c8e94e1aa2132924965d..e3b63dfa00371e3623b412d236aaa2bfa19eada9 100644
--- a/vllm/transformers_utils/configs/minimax_text_01.py
+++ b/vllm/transformers_utils/configs/minimax_text_01.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """ MiniMaxText01 model configuration"""
 
 from transformers.configuration_utils import PretrainedConfig
diff --git a/vllm/transformers_utils/configs/minimax_vl_01.py b/vllm/transformers_utils/configs/minimax_vl_01.py
index 99e0d249dc5a7e7dffeef41e165ef72dc39daef3..c62497192cc2a77533094d8d6acb9a984a1884ba 100644
--- a/vllm/transformers_utils/configs/minimax_vl_01.py
+++ b/vllm/transformers_utils/configs/minimax_vl_01.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """MiniMaxVL01 model configuration"""
 
 from transformers.configuration_utils import PretrainedConfig
diff --git a/vllm/transformers_utils/configs/mllama.py b/vllm/transformers_utils/configs/mllama.py
index eb77e09adca489bf0aecf46ef66d15884d3cdb56..f0cd2d52a529eb98aaa9948de5d520e2941c20c1 100644
--- a/vllm/transformers_utils/configs/mllama.py
+++ b/vllm/transformers_utils/configs/mllama.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from transformers.models.mllama import configuration_mllama as mllama_hf_config
 
diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py
index 70f60752905cbc7f50d7a41e1d8c1ad0450ecea8..2fa284e5c9e8f26ca90627057e9e316b7d6ae8ac 100644
--- a/vllm/transformers_utils/configs/mlp_speculator.py
+++ b/vllm/transformers_utils/configs/mlp_speculator.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/transformers_utils/configs/moonvit.py b/vllm/transformers_utils/configs/moonvit.py
index a2b4059a63efb69f0bb1f88033a57047a724b6d4..a6f712f3d60059875980e02acaf99a365985f92a 100644
--- a/vllm/transformers_utils/configs/moonvit.py
+++ b/vllm/transformers_utils/configs/moonvit.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
 from transformers.configuration_utils import PretrainedConfig
 
diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py
index 2d52658d3973c881390dc32c83f53f5ebd0ca528..91316408dcd89da16da4c65c30bc29bdb13e1fef 100644
--- a/vllm/transformers_utils/configs/mpt.py
+++ b/vllm/transformers_utils/configs/mpt.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copied from
 # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py
index fdf4fa2a53e5706eeed69f0ce8a6d1ec81584d74..d65b572dc7f22eaa9962d00b180188b011d532af 100644
--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2024 HuggingFace Inc. team. All rights reserved.
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
diff --git a/vllm/transformers_utils/configs/nemotron_h.py b/vllm/transformers_utils/configs/nemotron_h.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fe75f2dfeea8b2d8266428e9fca16b9148a373f
--- /dev/null
+++ b/vllm/transformers_utils/configs/nemotron_h.py
@@ -0,0 +1,258 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""NemotronH model configuration"""
+
+import regex as re
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class NemotronHConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
+    to the specified arguments, defining the model architecture. Instantiating
+    a configuration with the defaults will yield a similar configuration to
+    that of the NemotronH-v0.1 model.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 131072):
+            Vocabulary size of the NemotronH model. Defines the number of
+            different tokens that can be represented by the `inputs_ids`
+            passed when calling [`NemotronHModel`]
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be
+            tied. Note that this is only relevant if the model has a output
+            word embedding layer.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 21504):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 52):
+            Number of hidden layers in the Transformer encoder.
+        hybrid_override_pattern (`str`, *optional*, defaults to
+            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
+            The pattern of the hybrid model. The pattern is a string of
+            characters where each character represents
+            M: Mamba2, *: Attention, -: MLP
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the
+            Transformer encoder.
+        attention_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each attention head.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use
+            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
+            will use Multi Query Attention (MQA) otherwise GQA is used.
+        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
+            The non-linear activation function in the MLP layers.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in attention layers.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in MLP layers.
+        use_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the model.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
+            Whether or not residuals should be in `float32`. If set to `False`
+            residuals will keep the same `dtype` as the rest of the model.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`,
+            all logits will be calculated. If an integer value, only last
+            `num_logits_to_keep` logits will be calculated.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        sliding_window (`int`, *optional*, defaults to None):
+            Sliding window attention window size.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used
+            with.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the hidden states.
+        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use the fast mamba kernels.
+            These are available only if `mamba-ssm` and `causal-conv1d`
+            are installed, and the mamba modules are running on a CUDA device.
+        ssm_state_size (`int`, *optional*, defaults to 128):
+            The dimension of the mamba state space latents.
+        mamba_num_heads (`int`, *optional*, defaults to 128):
+            Number of heads in Mamba layers.
+        mamba_n_groups (`int`, *optional*, defaults to 8):
+            Number of groups in Mamba layers.
+        mamba_head_dim (`int`, *optional*, defaults to 64):
+            Dimension of each Mamba head.
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel.
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor used to determine the mamba intermediate size.
+        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
+            The non-linear activation function in the Mamba layers.
+        mamba_dt_min (`float`, *optional*, defaults to 0.001):
+            Minimum value for the time step in Mamba.
+        mamba_dt_max (`float`, *optional*, defaults to 0.1):
+            Maximum value for the time step in Mamba.
+        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
+            Limits for the time step in Mamba.
+        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
+            Floor value for time step initialization in Mamba.
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the convolution layer of the mamba mixer
+            block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the input and output projections of the
+            mamba mixer block.
+        mamba_chunk_size (`int`, *optional*, defaults to 256):
+            Size of chunks for Mamba processing.
+        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the pre-normalization residual connections.
+    """
+
+    model_type = "nemotron_h"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=131072,
+        tie_word_embeddings=False,
+        hidden_size=4096,
+        intermediate_size=21504,
+        num_hidden_layers=52,
+        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
+        num_attention_heads=32,
+        attention_head_dim=128,
+        num_key_value_heads=8,  # nemo: num_query_groups
+        mlp_hidden_act="relu2",
+        attention_bias=False,
+        mlp_bias=False,
+        use_bias=False,
+        initializer_range=0.02,  # nemo: init_method_std
+        layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
+        residual_in_fp32=False,  #  Megatron Core default value
+        use_cache=True,
+        num_logits_to_keep=1,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        sliding_window=None,
+        max_position_embeddings=4096,
+        attention_dropout=0.0,
+        hidden_dropout=0.0,  # * ADDED
+        use_mamba_kernels=True,
+        ssm_state_size=128,  # mamba_state_size
+        mamba_num_heads=128,
+        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
+        mamba_head_dim=64,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_hidden_act="silu",
+        mamba_dt_min=0.001,
+        mamba_dt_max=0.1,
+        mamba_dt_limit=(0.0, float("inf")),
+        mamba_dt_init_floor=1e-4,
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        mamba_chunk_size=256,
+        rescale_prenorm_residual=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.tie_word_embeddings = tie_word_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.hybrid_override_pattern = hybrid_override_pattern
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.sliding_window = sliding_window
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_dropout = attention_dropout
+        self.hidden_dropout = hidden_dropout
+
+        # Validate hybrid_override_pattern
+        # M: Mamba2, *: Attention, -: MLP
+        assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
+            "hybrid_override_pattern must have same length as "
+            "num_hidden_layers")
+        assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
+            "hybrid_override_pattern must only contain characters "
+            "'M', '*', or '-'")
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.mlp_hidden_act = mlp_hidden_act
+        self.attention_bias = attention_bias
+        self.mlp_bias = mlp_bias
+        self.use_bias = use_bias
+        self.initializer_range = initializer_range
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.residual_in_fp32 = residual_in_fp32
+
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+
+        self.use_mamba_kernels = use_mamba_kernels
+        self.n_groups = mamba_n_groups
+        self.mamba_head_dim = mamba_head_dim
+        self.ssm_state_size = ssm_state_size
+        self.mamba_num_heads = mamba_num_heads
+        self.conv_kernel = mamba_d_conv
+        self.expand = mamba_expand
+        self.mamba_hidden_act = mamba_hidden_act
+        self.time_step_min = mamba_dt_min
+        self.time_step_max = mamba_dt_max
+        self.time_step_limit = mamba_dt_limit
+        self.time_step_floor = mamba_dt_init_floor
+        self.use_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+        self.chunk_size = mamba_chunk_size
+        self.rescale_prenorm_residual = rescale_prenorm_residual
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def layers_block_type(self):
+        return [
+            "mamba" if self.hybrid_override_pattern[i] == "M" else
+            "attention" if self.hybrid_override_pattern[i] == "*" else "mlp"
+            for i in range(self.num_hidden_layers)
+        ]
diff --git a/vllm/transformers_utils/configs/nvlm_d.py b/vllm/transformers_utils/configs/nvlm_d.py
index 300f6e21168e55e7d15a9fc87a41878ccaa82cbb..a533720af6c6656d357461a2944e3c85b43aa624 100644
--- a/vllm/transformers_utils/configs/nvlm_d.py
+++ b/vllm/transformers_utils/configs/nvlm_d.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py
index 0ec224214f067099135f0ce5542b23f39d9c73be..c2728f0ed64c979c0c08a0b8fba77238712cddc6 100644
--- a/vllm/transformers_utils/configs/ovis.py
+++ b/vllm/transformers_utils/configs/ovis.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # yapf: disable
 # ruff: noqa: E501
diff --git a/vllm/transformers_utils/configs/skyworkr1v.py b/vllm/transformers_utils/configs/skyworkr1v.py
index ef5f9ba85c2376368debaabbe4380147fbed1833..33a45220e3159f0989366f6f9f7d4c0718cfbd57 100644
--- a/vllm/transformers_utils/configs/skyworkr1v.py
+++ b/vllm/transformers_utils/configs/skyworkr1v.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from
 # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/configuration_skywork_chat.py
diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py
index 6eaf699d17beef1481f755b32cdd62e1c84f537a..a83dfa40b43a57532a037c0278b30c3c1c8f239b 100644
--- a/vllm/transformers_utils/configs/solar.py
+++ b/vllm/transformers_utils/configs/solar.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
diff --git a/vllm/transformers_utils/configs/telechat2.py b/vllm/transformers_utils/configs/telechat2.py
index 5da6c5b4427ea1f7ed317467ed535f887c63330c..050a7851d143f3ba55268bcd2dde853084498445 100644
--- a/vllm/transformers_utils/configs/telechat2.py
+++ b/vllm/transformers_utils/configs/telechat2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # adapted from https://www.modelscope.cn/models/TeleAI/TeleChat2-3B/resolve/master/configuration_telechat2.py
 """ Telechat configuration compatible with LlamaConfig. """
diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py
index 4c50724272634bda5186da01360d5d3f9c34f3b7..62f63b02d49a4cbec2389f5a263d58f569f94449 100644
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
 from typing import Any, Optional
diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py
index 3adf2e32cca7ce77b37e526cb197d37a730a2c7c..380c62a141f0f23fefb4b1074382896860867ba5 100644
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py
index 7373fa0ede2373b3f5a6da6983cad430a0e573eb..342632989d579925e02e51d40c8015b240eee581 100644
--- a/vllm/transformers_utils/detokenizer_utils.py
+++ b/vllm/transformers_utils/detokenizer_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index ce6427de432da5e37fd61b481d2e9ba4e16f0c14..70cd08263d3721434763d86dc39ef3534705d7fa 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Optional, Union, cast
diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py
index 2bd9ab1f099b39ee75893b0613b20b8bf3eb621c..14d15f2bc1673821de69371ef17e0924929abba8 100644
--- a/vllm/transformers_utils/processors/__init__.py
+++ b/vllm/transformers_utils/processors/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.transformers_utils.processors.deepseek_vl2 import (
     DeepseekVLV2Processor)
diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py
index df960e9c7aa8fb2ee082a3e201dcdbe842c7b64d..b4669d12fa213b688b5210e6a7aaca067575a769 100644
--- a/vllm/transformers_utils/processors/deepseek_vl2.py
+++ b/vllm/transformers_utils/processors/deepseek_vl2.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # yapf: disable
 # ruff: noqa: E501
diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py
index f1c6407e1f3a3271763a61352b50bd9cfb5cddec..4fe76d0df622bbf1eccc7a88c5c7f5451ecfbb95 100644
--- a/vllm/transformers_utils/processors/ovis.py
+++ b/vllm/transformers_utils/processors/ovis.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # yapf: disable
 # ruff: noqa: E501
diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py
index 1c3520bcfb278ea8d3dd3b4dc951d1bcb434345b..f95aae7815e0bd9418c2ec3be097720fd70c0ec5 100644
--- a/vllm/transformers_utils/s3_utils.py
+++ b/vllm/transformers_utils/s3_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import fnmatch
 import os
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index fa7a208c48ed7a6e17ad8bb1a73a65affcaff50f..ae96ebe4eaa26f33e2d10c08b4f58d2548703684 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
 import copy
diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py
index d69e5a6b425137c4b58925ec7ea9ba5012e245c0..20e5fea714e7045d68b4a329cfd2ba881b18fa8d 100644
--- a/vllm/transformers_utils/tokenizer_base.py
+++ b/vllm/transformers_utils/tokenizer_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import importlib
 from abc import ABC, abstractmethod
diff --git a/vllm/transformers_utils/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group.py
index 8b9e4881ef88fe5a1d0f19571a48387d3e01b5a0..eb53cceaa058570c3d1166311aabe215ee2d2030 100644
--- a/vllm/transformers_utils/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py
index 7aac29a6bf967eef2136f506d298a897dc98f6e6..941156c4bf50e8cb795dcfdbf8088e7b4c8a3394 100644
--- a/vllm/transformers_utils/tokenizers/__init__.py
+++ b/vllm/transformers_utils/tokenizers/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from .mistral import (MistralTokenizer, maybe_serialize_tool_calls,
                       truncate_tool_call_ids, validate_request_params)
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 23b6f67f09df77bca99c84df10122f0fcb1ddcb3..24ac4580d670e9dbfdcd15fe8cac24b3e28c58a6 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from dataclasses import dataclass
@@ -186,6 +187,8 @@ class MistralTokenizer(TokenizerBase):
     def __init__(self, tokenizer: "PublicMistralTokenizer") -> None:
         self.mistral = tokenizer
         self.instruct = tokenizer.instruct_tokenizer
+        _mistral_version_str = self.instruct.tokenizer.version.value
+        self.version: int = int(_mistral_version_str.split("v")[-1])
 
         tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
         from mistral_common.tokens.tokenizers.tekken import (
diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py
index 8dff1b612fdbba6abdf966d07ff695ade509b978..66c8fb797adcde0a1672c9d99f1e1b04e209d2de 100644
--- a/vllm/transformers_utils/utils.py
+++ b/vllm/transformers_utils/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
 from functools import cache
diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py
index 9f14a907af3a5db111b8c1066618452eb23bc0cb..0fcf5d15afd1dbee06a7b54e7454b7fcdfbef6e0 100644
--- a/vllm/triton_utils/__init__.py
+++ b/vllm/triton_utils/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.triton_utils.importing import (HAS_TRITON, TritonLanguagePlaceholder,
                                          TritonPlaceholder)
diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py
index 8cf2e01a33bd6bd138afc67c82a7b65233f2d9a1..068fa303137c1eea1bc509f38e63b7e6970f9657 100644
--- a/vllm/triton_utils/importing.py
+++ b/vllm/triton_utils/importing.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import types
 from importlib.util import find_spec
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 90af0c63cc02b87f5d8da2c55e9373c8aaec492a..c149637635b770567a350c908a9b7be819e33f80 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import datetime
 import json
diff --git a/vllm/utils.py b/vllm/utils.py
index 63027c017901836dcc463817d47bb504c5176c9b..fd1395393decf9c14b2d229363e8c5f8e0c79481 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
@@ -37,8 +38,8 @@ from argparse import (Action, ArgumentDefaultsHelpFormatter, ArgumentParser,
                       _ArgumentGroup)
 from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
 from collections import UserDict, defaultdict
-from collections.abc import (AsyncGenerator, Awaitable, Generator, Hashable,
-                             Iterable, Iterator, KeysView, Mapping)
+from collections.abc import (AsyncGenerator, Awaitable, Collection, Generator,
+                             Hashable, Iterable, Iterator, KeysView, Mapping)
 from concurrent.futures.process import ProcessPoolExecutor
 from dataclasses import dataclass, field
 from functools import cache, lru_cache, partial, wraps
@@ -107,7 +108,7 @@ STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP = (
     "currently not supported for encoder/decoder "
     "models.")
 
-STR_NOT_IMPL_ENC_DEC_LORA = ("LoRA is currently not currently "
+STR_NOT_IMPL_ENC_DEC_LORA = ("LoRA is not currently "
                              "supported with encoder/decoder "
                              "models.")
 
@@ -898,6 +899,7 @@ class DeviceMemoryProfiler:
     def current_memory_usage(self) -> float:
         # Return the memory usage in bytes.
         from vllm.platforms import current_platform
+        gc.collect()
         return current_platform.get_current_memory_usage(self.device)
 
     def __enter__(self):
@@ -979,6 +981,53 @@ def get_dtype_size(dtype: torch.dtype) -> int:
     return torch.tensor([], dtype=dtype).element_size()
 
 
+# bool = 0, int = 1, float = 2, complex = 3
+def _get_precision_level(dtype: torch.dtype) -> int:
+    # NOTE: Complex dtypes return `is_floating_point=False`
+    return ((dtype != torch.bool) + dtype.is_floating_point +
+            dtype.is_complex * 2)
+
+
+def is_lossless_cast(src_dtype: torch.dtype, tgt_dtype: torch.dtype):
+    """
+    Test whether it is lossless to cast a tensor from
+    `src_dtype` to `tgt_dtype`.
+    """
+    if src_dtype == tgt_dtype:
+        return True
+
+    src_level = _get_precision_level(src_dtype)
+    tgt_level = _get_precision_level(tgt_dtype)
+
+    if src_level < tgt_level:
+        return True
+    if src_level > tgt_level:
+        return False
+
+    # Compare integral types
+    if not src_dtype.is_floating_point and not src_dtype.is_complex:
+        src_info = torch.iinfo(src_dtype)
+        tgt_info = torch.iinfo(tgt_dtype)
+        return src_info.min >= tgt_info.min and src_info.max <= tgt_info.max
+
+    # Compare floating-point types
+    src_info = torch.finfo(src_dtype)
+    tgt_info = torch.finfo(tgt_dtype)
+    return (src_info.min >= tgt_info.min and src_info.max <= tgt_info.max
+            and src_info.resolution >= tgt_info.resolution)
+
+
+def common_broadcastable_dtype(dtypes: Collection[torch.dtype]):
+    """
+    Get the common `dtype` where all of the other `dtypes` can be
+    cast to it without losing any information.
+    """
+    return max(
+        dtypes,
+        key=lambda dtype: sum(is_lossless_cast(dt, dtype) for dt in dtypes),
+    )
+
+
 # `collections` helpers
 def is_list_of(
     value: object,
@@ -1410,17 +1459,24 @@ class FlexibleArgumentParser(ArgumentParser):
         if '--config' in args:
             args = self._pull_args_from_config(args)
 
+        def repl(match: re.Match) -> str:
+            """Replaces underscores with dashes in the matched string."""
+            return match.group(0).replace("_", "-")
+
+        # Everything between the first -- and the first .
+        pattern = re.compile(r"(?<=--)[^\.]*")
+
         # Convert underscores to dashes and vice versa in argument names
         processed_args = []
         for arg in args:
             if arg.startswith('--'):
                 if '=' in arg:
                     key, value = arg.split('=', 1)
-                    key = '--' + key[len('--'):].replace('_', '-')
+                    key = pattern.sub(repl, key, count=1)
                     processed_args.append(f'{key}={value}')
                 else:
-                    processed_args.append('--' +
-                                          arg[len('--'):].replace('_', '-'))
+                    key = pattern.sub(repl, arg, count=1)
+                    processed_args.append(key)
             elif arg.startswith('-O') and arg != '-O' and len(arg) == 2:
                 # allow -O flag to be used without space, e.g. -O3
                 processed_args.append('-O')
@@ -1879,14 +1935,6 @@ def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
     return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
 
 
-def is_in_doc_build() -> bool:
-    try:
-        from sphinx.ext.autodoc.mock import _MockModule
-        return isinstance(zmq, _MockModule)
-    except ModuleNotFoundError:
-        return False
-
-
 def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
     """
     Import a Python file according to its file path.
@@ -2222,6 +2270,8 @@ def kill_process_tree(pid: int):
 class MemorySnapshot:
     """Memory snapshot."""
     torch_peak: int = 0
+    free_memory: int = 0
+    total_memory: int = 0
     cuda_memory: int = 0
     torch_memory: int = 0
     non_torch_memory: int = 0
@@ -2241,8 +2291,8 @@ class MemorySnapshot:
         self.torch_peak = torch.cuda.memory_stats().get(
             "allocated_bytes.all.peak", 0)
 
-        self.cuda_memory = torch.cuda.mem_get_info(
-        )[1] - torch.cuda.mem_get_info()[0]
+        self.free_memory, self.total_memory = torch.cuda.mem_get_info()
+        self.cuda_memory = self.total_memory - self.free_memory
 
         # torch.cuda.memory_reserved() is how many bytes
         # PyTorch gets from cuda (by calling cudaMalloc, etc.)
@@ -2255,6 +2305,8 @@ class MemorySnapshot:
     def __sub__(self, other: MemorySnapshot) -> MemorySnapshot:
         return MemorySnapshot(
             torch_peak=self.torch_peak - other.torch_peak,
+            free_memory=self.free_memory - other.free_memory,
+            total_memory=self.total_memory - other.total_memory,
             cuda_memory=self.cuda_memory - other.cuda_memory,
             torch_memory=self.torch_memory - other.torch_memory,
             non_torch_memory=self.non_torch_memory - other.non_torch_memory,
@@ -2276,6 +2328,16 @@ class MemoryProfilingResult:
     after_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
     profile_time: float = 0.0
 
+    def __repr__(self) -> str:
+        return (f"Memory profiling takes {self.profile_time:.2f} seconds. "
+                f"Total non KV cache memory: "
+                f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; "
+                f"torch peak memory increase: "
+                f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; "
+                f"non-torch forward increase memory: "
+                f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; "
+                f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB.")
+
 
 @contextlib.contextmanager
 def memory_profiling(
@@ -2415,7 +2477,7 @@ def make_zmq_path(scheme: str, host: str, port: Optional[int] = None) -> str:
     Returns:
         A properly formatted ZMQ path string.
     """
-    if not port:
+    if port is None:
         return f"{scheme}://{host}"
     if is_valid_ipv6_address(host):
         return f"{scheme}://[{host}]:{port}"
@@ -2427,6 +2489,9 @@ def make_zmq_socket(
     ctx: Union[zmq.asyncio.Context, zmq.Context],  # type: ignore[name-defined]
     path: str,
     socket_type: Any,
+    bind: Optional[bool] = None,
+    identity: Optional[bytes] = None,
+    linger: Optional[int] = None,
 ) -> Union[zmq.Socket, zmq.asyncio.Socket]:  # type: ignore[name-defined]
     """Make a ZMQ socket with the proper bind/connect semantics."""
 
@@ -2446,7 +2511,7 @@ def make_zmq_socket(
         buf_size = -1  # Use system default buffer size
 
     if bind is None:
-        bind = socket_type != zmq.PUSH
+        bind = socket_type not in (zmq.PUSH, zmq.SUB, zmq.XSUB)
 
     if socket_type in (zmq.PULL, zmq.DEALER, zmq.ROUTER):
         socket.setsockopt(zmq.RCVHWM, 0)
@@ -2459,6 +2524,9 @@ def make_zmq_socket(
     if identity is not None:
         socket.setsockopt(zmq.IDENTITY, identity)
 
+    if linger is not None:
+        socket.setsockopt(zmq.LINGER, linger)
+
     # Determine if the path is a TCP socket with an IPv6 address.
     # Enable IPv6 on the zmq socket if so.
     scheme, host, _ = split_zmq_path(path)
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7a580c2883c357e332cb58d1422084c1b3801de
--- /dev/null
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -0,0 +1,163 @@
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+import torch
+
+from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.attention.backends.torch_sdpa import (TorchSDPABackendImpl,
+                                                TorchSDPAMetadata)
+from vllm.attention.backends.utils import CommonAttentionState
+from vllm.attention.ops.ipex_attn import PagedAttention
+from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.kv_cache_interface import AttentionSpec
+from vllm.v1.worker.block_table import BlockTable
+from vllm.v1.worker.cpu_model_runner import CPUModelRunner
+from vllm.v1.worker.gpu_input_batch import InputBatch
+
+
+class TorchSDPABackend:
+    accept_output_buffer: bool = False
+
+    @staticmethod
+    def get_name() -> str:
+        return "TORCH_SDPA_VLLM_V1"
+
+    @staticmethod
+    def get_impl_cls() -> type["TorchSDPABackendImpl"]:
+        return TorchSDPABackendImpl
+
+    @staticmethod
+    def get_metadata_cls() -> type["AttentionMetadata"]:
+        return TorchSDPAMetadata
+
+    @staticmethod
+    def get_state_cls() -> type["CommonAttentionState"]:
+        return CommonAttentionState
+
+    @staticmethod
+    def get_builder_cls() -> type["TorchSDPAMetadataBuilderV1"]:
+        return TorchSDPAMetadataBuilderV1
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> tuple[int, ...]:
+        return PagedAttention.get_kv_cache_shape(num_blocks, block_size,
+                                                 num_kv_heads, head_size)
+
+    @staticmethod
+    def use_cascade_attention(*args, **kwargs) -> bool:
+        return False
+
+
+class TorchSDPAMetadataBuilderV1:
+
+    def __init__(self, runner: CPUModelRunner, kv_cache_spec: AttentionSpec,
+                 block_table: BlockTable) -> None:
+        self.runner = runner
+        self.block_table = block_table
+
+        # For reorder
+        self.reorder_prompt_req_index_list = np.empty(self.runner.max_num_reqs,
+                                                      dtype=np.int64)
+        self.reorder_decode_req_index_list = np.empty(self.runner.max_num_reqs,
+                                                      dtype=np.int64)
+        self.num_prompt_req: int = 0
+
+        self.seq_start_loc_cpu = torch.zeros(
+            runner.max_num_reqs + 1,
+            dtype=torch.int32,
+            device="cpu",
+        )
+        self.seq_start_loc_np = self.seq_start_loc_cpu.numpy()
+
+    def reorder_batch(self, input_batch: InputBatch,
+                      scheduler_output: SchedulerOutput) -> bool:
+        prompt_list_idx = 0
+        decode_list_idx = 0
+        for req_index in range(input_batch.num_reqs):
+            if input_batch.num_computed_tokens_cpu[
+                    req_index] < input_batch.num_prompt_tokens[req_index]:
+                # prompt stage
+                self.reorder_prompt_req_index_list[prompt_list_idx] = req_index
+                prompt_list_idx += 1
+            else:
+                # decode stage
+                self.reorder_decode_req_index_list[decode_list_idx] = req_index
+                decode_list_idx += 1
+        assert decode_list_idx + prompt_list_idx == input_batch.num_reqs
+
+        # Update prompt requests number
+        self.num_prompt_req = prompt_list_idx
+
+        reorder_req_num = 0
+        for req_index in range(decode_list_idx):
+            if self.reorder_decode_req_index_list[req_index] < prompt_list_idx:
+                reorder_req_num += 1
+            else:
+                break
+
+        if reorder_req_num == 0:
+            return False
+
+        reorder_prompt_list = (
+            self.reorder_prompt_req_index_list[:prompt_list_idx]
+            [-reorder_req_num:])
+        reorder_decode_list = (
+            self.reorder_decode_req_index_list[:decode_list_idx]
+            [:reorder_req_num])
+        assert reorder_decode_list.size == reorder_prompt_list.size
+
+        for idx in range(reorder_req_num):
+            prompt_req_index = reorder_prompt_list[idx].item()
+            decode_req_index = reorder_decode_list[idx].item()
+            input_batch.swap_states(prompt_req_index, decode_req_index)
+
+        return True
+
+    def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
+              common_prefix_len: int,
+              common_attn_metadata: CommonAttentionMetadata):
+        runner = self.runner
+        block_table = self.block_table
+        seq_lens_np = runner.seq_lens_np[:num_reqs]
+        num_prompt_req = self.num_prompt_req
+        max_prefill_seq_len = seq_lens_np[:num_prompt_req].max().item(
+        ) if num_prompt_req > 0 else 0
+        max_decode_seq_len = seq_lens_np[num_prompt_req:num_reqs].max().item(
+        ) if num_prompt_req < num_reqs else 0
+        self.seq_start_loc_np[0] = 0
+        np.cumsum(seq_lens_np, out=self.seq_start_loc_np[1:num_reqs + 1])
+        num_prefill_tokens = runner.query_start_loc_np[num_prompt_req].item()
+        num_decode_tokens = runner.query_start_loc_np[num_reqs].item(
+        ) - num_prefill_tokens
+        slot_mapping = block_table.slot_mapping_cpu[:num_actual_tokens].long()
+        block_table_tensor = block_table.get_device_tensor()
+        attn_metadata = TorchSDPAMetadata(
+            num_prefills=num_prompt_req,
+            num_prefill_tokens=num_prefill_tokens,
+            num_decode_tokens=num_decode_tokens,
+            slot_mapping=slot_mapping,
+            seq_lens_tensor=runner.
+            seq_lens_cpu[num_prompt_req:num_reqs],  # decode
+            max_decode_seq_len=max_decode_seq_len,  # decode
+            block_tables=block_table_tensor[num_prompt_req:num_reqs],  # decode
+            chunked_prefill=True,
+            max_query_len=max_query_len,
+            max_kv_len=max_prefill_seq_len,
+            prefill_query_start_loc=runner.
+            query_start_loc_cpu[:num_prompt_req + 1],  # prefill
+            kv_start_loc=self.seq_start_loc_cpu[:num_prompt_req +
+                                                1],  # prefill
+            prefill_block_tables=block_table_tensor[:
+                                                    num_prompt_req],  # prefill
+            query_start_loc=runner.query_start_loc_cpu[:num_reqs +
+                                                       1],  # for logits index
+            multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=False,
+        )
+
+        return attn_metadata
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 803220838a624da824dd013703fc03cc5dd76fcd..54c76e146bb5eac4f910a3e20fad884861ccedaa 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with FlashAttention."""
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional
@@ -15,6 +16,8 @@ from vllm.attention.ops.merge_attn_states import merge_attn_states
 from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8,
                                            get_flash_attn_version)
 from vllm.config import VllmConfig, get_layers_from_vllm_config
+from vllm.distributed.kv_transfer.kv_connector.utils import (
+    get_kv_connector_cache_layout)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import cdiv
@@ -69,6 +72,20 @@ class FlashAttentionBackend(AttentionBackend):
             raise ValueError("Block size must be a multiple of 16.")
         return (2, num_blocks, block_size, num_kv_heads, head_size)
 
+    @staticmethod
+    def get_kv_cache_stride_order() -> tuple[int, ...]:
+        # NOTE When running disaggregated PD with NIXL, HND layout is used for
+        # faster transfer. `stride_order` indicates the permutation that gets
+        # us from `get_kv_cache_shape` to the actual memory layout we want.
+        cache_layout = get_kv_connector_cache_layout()
+        if cache_layout == "NHD":
+            stride_order = (0, 1, 2, 3, 4)
+        elif cache_layout == "HND":
+            stride_order = (0, 1, 3, 2, 4)
+        else:
+            raise ValueError("Unknown cache layout format %s.", cache_layout)
+        return stride_order
+
 
 @dataclass
 class FlashAttentionMetadata:
@@ -306,13 +323,14 @@ class FlashAttentionMetadataBuilder:
         self.kv_cache_spec = kv_cache_spec
         self.block_table = block_table
 
-        if get_flash_attn_version() == 3:
-            self.aot_schedule = not compilation_config.full_cuda_graph
-            if not self.aot_schedule:
-                logger.warning(
-                    "AOT Schedule is disabled when using full_cuda_graph")
-        else:
-            self.aot_schedule = False
+        self.aot_schedule = (get_flash_attn_version() == 3)
+        self.use_full_cuda_graph = compilation_config.full_cuda_graph
+        if self.use_full_cuda_graph and not self.aot_schedule:
+            raise ValueError("Full CUDA graph mode requires AOT scheduling, "
+                             "which requires FlashAttention 3.")
+        self.scheduler_metadata = torch.zeros(self.runner.max_num_reqs + 1,
+                                              dtype=torch.int32,
+                                              device=self.runner.device)
 
         # Sliding window size to be used with the AOT scheduler will be
         # populated on first build() call.
@@ -330,7 +348,7 @@ class FlashAttentionMetadataBuilder:
     def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
               common_prefix_len: int,
               common_attn_metadata: CommonAttentionMetadata):
-        max_seq_len = self.runner.seq_lens_np[:num_reqs].max()
+        max_seq_len = int(self.runner.seq_lens_np[:num_reqs].max())
         query_start_loc = common_attn_metadata.query_start_loc
         seq_lens = common_attn_metadata.seq_lens
         block_table = self.block_table
@@ -468,6 +486,18 @@ class FlashAttentionMetadataBuilder:
                                           max_seq_len=max_seq_len,
                                           causal=True)
 
+        if self.use_full_cuda_graph:
+            assert scheduler_metadata is not None
+            n = scheduler_metadata.shape[0]
+            self.scheduler_metadata[:n].copy_(scheduler_metadata,
+                                              non_blocking=True)
+            # NOTE(woosuk): We should zero out the rest of the scheduler
+            # metadata to guarantee the correctness. Otherwise, some thread
+            # blocks may use the invalid scheduler metadata and overwrite the
+            # output buffer.
+            self.scheduler_metadata[n:] = 0
+            scheduler_metadata = self.scheduler_metadata[:n]
+
         attn_metadata = FlashAttentionMetadata(
             num_actual_tokens=num_actual_tokens,
             max_query_len=max_query_len,
@@ -505,6 +535,7 @@ class FlashAttentionImpl(AttentionImpl):
         blocksparse_params: Optional[dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
         use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
@@ -526,6 +557,7 @@ class FlashAttentionImpl(AttentionImpl):
             # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
             logits_soft_cap = 0
         self.logits_soft_cap = logits_soft_cap
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
 
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
@@ -589,22 +621,26 @@ class FlashAttentionImpl(AttentionImpl):
         # performance to make sure it does not introduce any overhead.
 
         num_actual_tokens = attn_metadata.num_actual_tokens
-        # Reshape the input keys and values and store them in the cache.
-        # NOTE(woosuk): Here, key and value are padded while slot_mapping is
-        # not padded. However, we don't need to do key[:num_actual_tokens] and
-        # value[:num_actual_tokens] because the reshape_and_cache_flash op uses
-        # the slot_mapping's shape to determine the number of actual tokens.
         key_cache, value_cache = kv_cache.unbind(0)
-        torch.ops._C_cache_ops.reshape_and_cache_flash(
-            key,
-            value,
-            key_cache,
-            value_cache,
-            attn_metadata.slot_mapping,
-            self.kv_cache_dtype,
-            layer._k_scale,
-            layer._v_scale,
-        )
+
+        if self.kv_sharing_target_layer_name is None:
+            # Reshape the input keys and values and store them in the cache.
+            # Skip this if sharing KV cache with an earlier attention layer.
+            # NOTE(woosuk): Here, key and value are padded while slot_mapping is
+            # not padded. However, we don't need to do key[:num_actual_tokens]
+            # and value[:num_actual_tokens] because the reshape_and_cache_flash
+            # op uses the slot_mapping's shape to determine the number of
+            # actual tokens.
+            torch.ops._C_cache_ops.reshape_and_cache_flash(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                attn_metadata.slot_mapping,
+                self.kv_cache_dtype,
+                layer._k_scale,
+                layer._v_scale,
+            )
 
         if self.kv_cache_dtype.startswith("fp8"):
             key_cache = key_cache.view(torch.float8_e4m3fn)
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 1c4f7f62fa675e8b085f0fd88d9f37ad84d5c542..b15bb4b3152afde7eb721e454dc66cf9173ab58f 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with FlashInfer."""
 from __future__ import annotations
 
@@ -506,7 +507,13 @@ class FlashInferImpl(AttentionImpl):
         blocksparse_params: Optional[dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[int] = None,
+        use_irope: bool = False,
     ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in FlashInfer is not supported yet, it will fall"
+                " back to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
@@ -520,6 +527,7 @@ class FlashInferImpl(AttentionImpl):
             self.sliding_window = (sliding_window - 1, 0)
         self.kv_cache_dtype = kv_cache_dtype
         self.logits_soft_cap = logits_soft_cap
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
 
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
@@ -567,21 +575,25 @@ class FlashInferImpl(AttentionImpl):
         # performance to make sure it does not introduce any overhead.
 
         num_actual_tokens = attn_metadata.num_actual_tokens
-        # Reshape the input keys and values and store them in the cache.
-        # NOTE(woosuk): Here, key and value are padded while slot_mapping is
-        # not padded. However, we don't need to do key[:num_actual_tokens] and
-        # value[:num_actual_tokens] because the reshape_and_cache_flash op uses
-        # the slot_mapping's shape to determine the number of actual tokens.
-        torch.ops._C_cache_ops.reshape_and_cache_flash(
-            key,
-            value,
-            kv_cache[:, 0],
-            kv_cache[:, 1],
-            attn_metadata.slot_mapping,
-            self.kv_cache_dtype,
-            layer._k_scale,
-            layer._v_scale,
-        )
+
+        if self.kv_sharing_target_layer_name is None:
+            # Reshape the input keys and values and store them in the cache.
+            # Skip this if sharing KV cache with an earlier attention layer.
+            # NOTE(woosuk): Here, key and value are padded while slot_mapping is
+            # not padded. However, we don't need to do key[:num_actual_tokens]
+            # and value[:num_actual_tokens] because the reshape_and_cache_flash
+            # op uses the slot_mapping's shape to determine the number of
+            # actual tokens.
+            torch.ops._C_cache_ops.reshape_and_cache_flash(
+                key,
+                value,
+                kv_cache[:, 0],
+                kv_cache[:, 1],
+                attn_metadata.slot_mapping,
+                self.kv_cache_dtype,
+                layer._k_scale,
+                layer._v_scale,
+            )
 
         window_left = (self.sliding_window[0]
                        if self.sliding_window is not None else -1)
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b473b1461a68177041961ee5e10331664d15214
--- /dev/null
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -0,0 +1,477 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Attention layer with FlashAttention."""
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+from torch.nn.attention.flex_attention import (BlockMask, _mask_mod_signature,
+                                               _score_mod_signature,
+                                               create_block_mask,
+                                               flex_attention)
+
+from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionMetadata, AttentionType,
+                                              is_quantized_kv_cache)
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.kv_cache_interface import AttentionSpec
+from vllm.v1.worker.block_table import BlockTable
+
+if current_platform.is_cuda():
+    pass
+
+logger = init_logger(__name__)
+
+if TYPE_CHECKING:
+    from vllm.v1.core.sched.output import SchedulerOutput
+    from vllm.v1.worker.gpu_input_batch import InputBatch
+    from vllm.v1.worker.gpu_model_runner import GPUModelRunner
+
+create_block_mask_compiled = torch.compile(create_block_mask,
+                                           fullgraph=True,
+                                           mode="reduce-overhead")
+flex_attention_compiled = torch.compile(flex_attention, fullgraph=True)
+
+
+def _offsets_to_doc_ids_tensor(offsets: torch.Tensor) -> torch.Tensor:
+    device = offsets.device
+    counts = offsets[1:] - offsets[:-1]
+    return torch.repeat_interleave(
+        torch.arange(len(counts), device=device, dtype=torch.int32), counts)
+
+
+class FlexAttentionBackend(AttentionBackend):
+    accept_output_buffer: bool = True
+
+    @staticmethod
+    def get_supported_head_sizes() -> list[int]:
+        return [16, 32, 64, 96, 128, 160, 192, 224, 256]
+
+    @staticmethod
+    def get_name() -> str:
+        return "FLEX_ATTENTION"
+
+    @staticmethod
+    def get_impl_cls() -> type["FlexAttentionImpl"]:
+        return FlexAttentionImpl
+
+    @staticmethod
+    def get_metadata_cls() -> type["AttentionMetadata"]:
+        return FlexAttentionMetadata
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> tuple[int, ...]:
+        return (2, num_blocks, block_size, num_kv_heads, head_size)
+
+    @staticmethod
+    def get_builder_cls() -> type["FlexAttentionMetadataBuilder"]:
+        return FlexAttentionMetadataBuilder
+
+    @staticmethod
+    def use_cascade_attention(*args, **kwargs) -> bool:
+        return False
+
+
+# @torch.compile(fullgraph=True, mode="reduce-overhead")
+def physical_to_logical_mapping(
+        block_table: torch.Tensor,
+        total_blocks: Optional[int] = None) -> torch.Tensor:
+    """
+    Creates an inverse mapping from physical block locations to logical indices.
+
+    The original block_table maps from logical blocks to physical locations:
+
+    Logical to Physical (Original block_table):
+    ┌───────────────────────────────────────────┐
+    │ Request 0:                                │
+    │                                           │
+    │ Logical Blocks:  0  1  2  3  4  5  6  7   │
+    │                  │  │  │  │  │  │  │  │   │
+    │                  v  v  v  v  v  v  v  v   │
+    │ Physical Blocks: 3  5  1  7  4  2  0  6   │
+    └───────────────────────────────────────────┘
+
+    This function creates the inverse mapping:
+
+    Physical to Logical (Inverse mapping):
+    ┌───────────────────────────────────────────┐
+    │ Request 0:                                │
+    │                                           │
+    │ Physical Blocks: 0  1  2  3  4  5  6  7   │
+    │                  │  │  │  │  │  │  │  │   │
+    │                  v  v  v  v  v  v  v  v   │
+    │ Logical Blocks:  6  2  5  0  4  1  7  3   │
+    └───────────────────────────────────────────┘
+
+    If multiple logical blocks map to the same physical block,
+    this function returns the first (minimum) logical block index.
+
+    If a physical block is not mapped to by any logical block,
+    its value in the result will be -1.
+
+
+    Args:
+        block_table: Tensor of shape [max_reqs, max_num_blocks]
+            mapping logical blocks to physical locations
+
+    Returns:
+        A tensor of shape [max_reqs, max_physical_block]
+    """
+    max_reqs, max_num_blocks = block_table.shape
+    device = block_table.device
+
+    physical_to_logical = torch.full((max_reqs, total_blocks),
+                                     -1,
+                                     dtype=torch.long,
+                                     device=device)
+
+    logical_indices = (torch.arange(max_num_blocks,
+                                    device=device).unsqueeze(0).expand(
+                                        max_reqs, -1))
+
+    physical_to_logical.scatter_(-1, block_table.to(torch.int64),
+                                 logical_indices)
+    # TODO Confirm - Seems like block 0 is always empty so we reset it manually
+    physical_to_logical[:, 0] = -1
+    return physical_to_logical
+
+
+def causal_mask_mod(b: torch.Tensor, h: torch.Tensor, q_idx: torch.Tensor,
+                    kv_idx: torch.Tensor):
+    return q_idx >= kv_idx
+
+
+@dataclass
+class FlexAttentionMetadata:
+    num_actual_tokens: int  # Number of tokens excluding padding.
+    max_query_len: int
+    query_start_loc: torch.Tensor
+    max_seq_len: int
+    seq_lens: torch.Tensor
+    block_table: torch.Tensor
+    slot_mapping: torch.Tensor
+
+    use_cascade: bool
+    common_prefix_len: int
+    cu_prefix_query_lens: Optional[torch.Tensor]
+    prefix_kv_lens: Optional[torch.Tensor]
+    suffix_kv_lens: Optional[torch.Tensor]
+
+    # Block info
+    total_cache_tokens: int
+    block_size: int
+    max_possible_sequence_length: int
+    num_reqs: int
+    physical_to_logical: torch.Tensor
+    decode_offset: torch.Tensor
+
+    # For logging.
+    num_input_tokens: int = 0  # Number of tokens including padding.
+
+    # Flex Metadata
+    num_blocks = 0
+    block_mask: Optional[BlockMask] = None
+    score_mod: Optional[_score_mod_signature] = None
+    mask_mod: Optional[_mask_mod_signature] = None
+    logical_mask_mod: _mask_mod_signature = causal_mask_mod
+
+    def get_mask_mod(self) -> _mask_mod_signature:
+        """Creates the mask_mod function for FlexAttention.
+
+        This function creates the combined mask mod function that handles:
+            1. The paged attention block mapping
+            2. The mapping from packed query sequences to logical query entries
+
+        It also by defaults adds the decoding offset to the query indices.
+        With this info we create the "logical" indices that are passed to
+        mask_mod functions. This allows mask mod functions to be agnostic to
+        layout of the query and key/value tensors.
+
+        TODO is_within_lower_bound: do sequences start on block_boundaries?
+        """
+        # Create a lookup mapping from query indices -> request number
+        request_lookup = _offsets_to_doc_ids_tensor(self.query_start_loc)
+
+        def final_mask_mod(
+            b: torch.Tensor,
+            h: torch.Tensor,
+            q_idx: torch.Tensor,
+            physical_kv_idx: torch.Tensor,
+        ) -> torch.Tensor:
+            # Map query indices to corresponding request indices
+            q_req = request_lookup[q_idx]
+
+            # Convert physical KV indices to logical indices
+            physical_kv_block = physical_kv_idx // self.block_size
+            physical_kv_offset = physical_kv_idx % self.block_size
+            logical_block_idx = self.physical_to_logical[q_req,
+                                                         physical_kv_block]
+            logical_kv_idx = logical_block_idx * self.block_size + physical_kv_offset  # noqa: E501
+
+            # Determine valid kv indices
+            live_block = logical_block_idx >= 0
+            within_upper_bound = logical_kv_idx < self.seq_lens[q_req]
+            within_lower_bound = logical_kv_idx >= 0
+
+            is_valid = live_block & within_upper_bound & within_lower_bound
+
+            # Convert physical query indices to logical indices
+            local_q_idx = q_idx - self.query_start_loc[q_req]
+            logical_q_idx = local_q_idx + self.decode_offset[q_req]
+
+            # Apply mask modification only for valid indices
+            return torch.where(
+                is_valid,
+                self.logical_mask_mod(b, h, logical_q_idx, logical_kv_idx),
+                False,
+            )
+
+        return final_mask_mod
+
+    def build_block_mask(self) -> BlockMask:
+        assert self.mask_mod is not None
+        return create_block_mask_compiled(
+            self.mask_mod,
+            None,
+            None,
+            self.num_actual_tokens,
+            self.total_cache_tokens,
+        )
+
+    def __post_init__(self):
+        assert self.use_cascade is False, "Not implemented yet."
+        assert self.common_prefix_len == 0, "Not implemented yet."
+        assert self.cu_prefix_query_lens is None, "Not implemented yet."
+        assert self.prefix_kv_lens is None, "Not implemented yet."
+        assert self.suffix_kv_lens is None, "Not implemented yet."
+        self.num_blocks = self.total_cache_tokens // self.block_size
+        self.mask_mod = self.get_mask_mod()
+        self.block_mask = self.build_block_mask()
+
+
+class FlexAttentionMetadataBuilder:
+
+    def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec,
+                 block_table: BlockTable):
+        model_config = runner.model_config
+
+        self.runner = runner
+        self.num_heads_q = model_config.get_num_attention_heads(
+            runner.parallel_config)
+        self.num_heads_kv = model_config.get_num_kv_heads(
+            runner.parallel_config)
+        self.headdim = model_config.get_head_size()
+        self.block_size = kv_cache_spec.block_size
+        self.kv_cache_spec = kv_cache_spec
+        self.block_table = block_table
+
+    def reorder_batch(self, input_batch: "InputBatch",
+                      scheduler_output: "SchedulerOutput") -> bool:
+        return False
+
+    def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
+              common_prefix_len: int,
+              common_attn_metadata: CommonAttentionMetadata):
+        max_seq_len = self.runner.seq_lens_np[:num_reqs].max()
+        query_start_loc = common_attn_metadata.query_start_loc
+        seq_lens = common_attn_metadata.seq_lens
+
+        block_table = self.block_table
+        block_table_tensor = block_table.get_device_tensor()[:num_reqs]
+        block_table.slot_mapping[:num_actual_tokens].copy_(
+            block_table.slot_mapping_cpu[:num_actual_tokens],
+            non_blocking=True)
+        slot_mapping = block_table.slot_mapping[:num_actual_tokens]
+
+        use_cascade = common_prefix_len > 0
+        cu_prefix_query_lens = None
+        prefix_kv_lens = None
+        suffix_kv_lens = None
+        if use_cascade:
+            raise NotImplementedError("Not yet my friend")
+
+        block_size = self.kv_cache_spec.block_size
+        max_possible_seq_len = self.runner.model_config.max_model_len
+        total_cache_tokens = (self.runner.cache_config.num_gpu_blocks *
+                              block_size)
+
+        inverse_block_table = physical_to_logical_mapping(
+            block_table_tensor, self.runner.cache_config.num_gpu_blocks)
+
+        # Get the original offset tensor
+        offset_tensor = torch.tensor(
+            self.runner.input_batch.num_computed_tokens_cpu[:num_reqs]).to(
+                self.runner.device, non_blocking=True)
+
+        out = FlexAttentionMetadata(
+            num_actual_tokens=num_actual_tokens,
+            max_query_len=max_query_len,
+            query_start_loc=query_start_loc,
+            max_seq_len=max_seq_len,
+            seq_lens=seq_lens,
+            block_table=block_table_tensor,
+            slot_mapping=slot_mapping,
+            use_cascade=use_cascade,
+            common_prefix_len=common_prefix_len,
+            cu_prefix_query_lens=cu_prefix_query_lens,
+            prefix_kv_lens=prefix_kv_lens,
+            suffix_kv_lens=suffix_kv_lens,
+            block_size=block_size,
+            max_possible_sequence_length=max_possible_seq_len,
+            num_reqs=num_reqs,
+            physical_to_logical=inverse_block_table,
+            total_cache_tokens=total_cache_tokens,
+            decode_offset=offset_tensor,
+        )
+        return out
+
+    def use_cascade_attention(self, *args, **kwargs) -> bool:
+        return False
+
+
+class FlexAttentionImpl(AttentionImpl):
+    sliding_window: Optional[tuple[int, int]]
+    alibi_slopes: Optional[torch.Tensor]
+    logits_soft_cap: Optional[float]
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[list[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[dict[str, Any]] = None,
+        logits_soft_cap: Optional[float] = None,
+        attn_type: AttentionType = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
+    ) -> None:
+        if blocksparse_params is not None:
+            # TODO we should support this :think
+            raise ValueError(
+                "FlashAttention does not support block-sparse attention.")
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = float(scale)
+        self.num_kv_heads = num_kv_heads
+
+        if alibi_slopes is not None:
+            raise NotImplementedError(
+                "FlexAttention does not support alibi slopes yet.")
+        else:
+            self.alibi_slopes = None
+        if sliding_window is not None:
+            raise NotImplementedError(
+                "FlexAttention does not support sliding window yet.")
+        else:
+            self.sliding_window = (-1, -1)
+        self.kv_cache_dtype = kv_cache_dtype
+        self.logits_soft_cap = logits_soft_cap
+        if self.logits_soft_cap is not None:
+            raise NotImplementedError(
+                "FlexAttention does not support logits soft cap yet.")
+
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError(
+                "FlexAttention does not support kv sharing yet.")
+
+        support_head_sizes = FlexAttentionBackend.get_supported_head_sizes()
+        if head_size not in support_head_sizes:
+            raise ValueError(
+                f"Head size {head_size} is not supported by FlashAttention. "
+                f"Supported head sizes are: {support_head_sizes}. "
+                "Set VLLM_USE_V1=0 to use another attention backend.")
+        if is_quantized_kv_cache(self.kv_cache_dtype):
+            raise NotImplementedError(
+                "FlexAttention does not support quantized kv-cache. Yet")
+
+    @staticmethod
+    def view_as_4d(tensor: torch.Tensor) -> torch.Tensor:
+        """View a 3d tensor as 4D."""
+        if tensor.ndim == 4:
+            return tensor
+        assert tensor.ndim == 3
+        return tensor[None, :, :, :]
+
+    def forward(
+        self,
+        layer: torch.nn.Module,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: FlexAttentionMetadata,
+        output: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Forward pass with FLexAttention.
+
+        Args:
+            query: shape = [num_tokens, num_heads, head_size]
+            key: shape = [num_tokens, num_kv_heads, head_size]
+            value: shape = [num_tokens, num_kv_heads, head_size]
+            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            attn_metadata: Metadata for attention.
+        Returns:
+            shape = [num_tokens, num_heads * head_size]
+        """
+        assert output is not None, "Output tensor must be provided."
+        enable_gqa = self.num_kv_heads != self.num_heads
+
+        if attn_metadata is None:
+            # Profiling run.
+            return output
+            # query = self.view_as_4d(query).permute(0, 2, 1, 3)
+            # return torch.empty_like(query)
+
+        num_actual_tokens = attn_metadata.num_actual_tokens
+
+        key_cache, value_cache = kv_cache.unbind(0)
+
+        torch.ops._C_cache_ops.reshape_and_cache_flash(
+            key,
+            value,
+            key_cache,
+            value_cache,
+            attn_metadata.slot_mapping,
+            self.kv_cache_dtype,
+            layer._k_scale,
+            layer._v_scale,
+        )
+
+        # View out the block_size dim
+        key_cache = key_cache.view(-1, self.num_kv_heads, self.head_size)
+        value_cache = value_cache.view(-1, self.num_kv_heads, self.head_size)
+        query, key_cache, value_cache = map(
+            lambda x: self.view_as_4d(x).permute(0, 2, 1, 3),
+            (query, key_cache, value_cache),
+        )
+        query = query[:, :, :num_actual_tokens, :]
+        # Doesn't work for now -> constraint violation
+        # torch._dynamo.try_mark_dynamic(query, 2)
+        out = flex_attention_compiled(
+            query,
+            key_cache,
+            value_cache,
+            attn_metadata.score_mod,
+            attn_metadata.block_mask,
+            self.scale,
+            enable_gqa=enable_gqa,
+            kernel_options={"FORCE_USE_FLEX_ATTENTION": True},
+        )
+
+        # Flex doesn't have an out variant today, rely on epilogue fusion
+        out = out.permute(0, 2, 1, 3).squeeze(0)
+        output[:num_actual_tokens, :, :].copy_(out)
+        return output
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 1edfab26b6c12f015431bb2ea73706c78d976bf0..e6b4f6404632cb9368b500bea0c14aeb7fb911bd 100644
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 # MLA Common Components
 
@@ -349,7 +350,7 @@ class MLACommonMetadataBuilder(Generic[M]):
         self.num_heads = model_config.get_num_attention_heads(
             runner.parallel_config)
         self.mla_dims = get_mla_dims(model_config)
-        self.aot_schedule = is_vllm_fa and (get_flash_attn_version() == 3)
+        self.aot_schedule = current_platform.is_cuda()
         self.kv_cache_spec = kv_cache_spec
 
         # Dont try to access the runner on AMD
@@ -585,6 +586,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
         blocksparse_params: Optional[dict[str, Any]],
         logits_soft_cap: Optional[float],
         attn_type: str,
+        kv_sharing_target_layer_name: Optional[str],
         # MLA Specific Arguments
         q_lora_rank: Optional[int],
         kv_lora_rank: int,
@@ -594,6 +596,9 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
         v_head_dim: int,
         kv_b_proj: ColumnParallelLinear,
     ) -> None:
+        if kv_sharing_target_layer_name is not None:
+            raise NotImplementedError("KV sharing is not supported for MLA")
+
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8ec571989c682bf0b8fa531a17444f6cd6732c0
--- /dev/null
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+
+import vllm._custom_ops as ops
+from vllm.attention.backends.abstract import (AttentionType,
+                                              is_quantized_kv_cache)
+from vllm.logger import init_logger
+from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
+                                                   MLACommonImpl,
+                                                   MLACommonMetadata)
+
+logger = init_logger(__name__)
+
+
+class CutlassMLABackend(MLACommonBackend):
+
+    @staticmethod
+    def get_name() -> str:
+        return "CUTLASS_MLA_VLLM_V1"
+
+    @staticmethod
+    def get_impl_cls() -> type["CutlassMLAImpl"]:
+        return CutlassMLAImpl
+
+
+class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
+
+    def __init__(
+            self,
+            num_heads: int,
+            head_size: int,
+            scale: float,
+            num_kv_heads: int,
+            alibi_slopes: Optional[list[float]],
+            sliding_window: Optional[int],
+            kv_cache_dtype: str,
+            blocksparse_params: Optional[dict[str, Any]],
+            logits_soft_cap: Optional[float],
+            attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
+            # MLA Specific Arguments
+            **mla_args) -> None:
+        super().__init__(num_heads, head_size, scale, num_kv_heads,
+                         alibi_slopes, sliding_window, kv_cache_dtype,
+                         blocksparse_params, logits_soft_cap, attn_type,
+                         kv_sharing_target_layer_name, **mla_args)
+
+        unsupported_features = [
+            alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
+        ]
+        if any(unsupported_features):
+            raise NotImplementedError(
+                "CutlassMLAImpl does not support one of the following: "
+                "alibi_slopes, sliding_window, blocksparse_params, "
+                "logits_soft_cap")
+
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "CutlassMLAImpl")
+
+        if is_quantized_kv_cache(self.kv_cache_dtype):
+            raise NotImplementedError(
+                "CutlassMLA V1 with FP8 KV cache not yet supported")
+
+    def _forward_decode(
+        self,
+        q_nope: torch.Tensor,
+        q_pe: torch.Tensor,
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: MLACommonMetadata,
+    ) -> torch.Tensor:
+        assert kv_c_and_k_pe_cache.numel() > 0
+        assert attn_metadata.decode is not None
+
+        if self.kv_cache_dtype.startswith("fp8"):
+            raise NotImplementedError("FP8 Cutlass MLA not yet supported")
+
+        B = q_nope.shape[0]
+
+        o = torch.empty((B, self.num_heads, self.kv_lora_rank),
+                        dtype=q_nope.dtype,
+                        device=q_nope.device)
+
+        # Run MLA
+        # Clone q_nope and q_pe to make sure strides computation is correct.
+        q_nope = q_nope.clone()
+        q_pe = q_pe.clone()
+        ops.cutlass_mla_decode(o, q_nope, q_pe, kv_c_and_k_pe_cache,
+                               attn_metadata.decode.seq_lens,
+                               attn_metadata.decode.block_table, self.scale)
+
+        return self._v_up_proj(o)
diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py
index e6594c6b6fa8c6b87ca20183a48ae99804f62c49..318b8ede1436619e9f30dd551e8ddd622ce5a478 100644
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Any, Optional
@@ -92,12 +93,13 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
             blocksparse_params: Optional[dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
 
         assert is_flashmla_supported(), \
             "FlashMLA is not supported on this device"
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index 31980e94a03766e6872bd28a24975cf2dee18f04..1f0406a7ac1f8014b445ce63e92bb238d6a65ec8 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Any, Optional
@@ -66,9 +67,6 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
     def __init__(self, runner, kv_cache_spec: AttentionSpec,
                  block_table: BlockTable):
         super().__init__(runner, kv_cache_spec, block_table)
-        max_model_len = self.runner.model_config.max_model_len
-        assert max_model_len == 32768,\
-            "AITER MLA requires max_model_len=32768"
         assert self.kv_cache_spec.block_size == 1, "AITER MLA" \
             "only supports block size 1."
 
@@ -141,12 +139,13 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
             blocksparse_params: Optional[dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
         assert (num_heads == 16 or num_heads == 128), (
             f"Aiter MLA only supports 16 or 128 number of heads.\n"
             f"Provided {num_heads} number of heads.\n"
diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py
index 2e6b619db62876ab95372cc6d1d3dade99ada56d..e26d7909184b518f297de3c7409d66bb1d9f876b 100644
--- a/vllm/v1/attention/backends/mla/triton_mla.py
+++ b/vllm/v1/attention/backends/mla/triton_mla.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Any, Optional
 
@@ -40,12 +41,13 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
             blocksparse_params: Optional[dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         **mla_args)
+                         kv_sharing_target_layer_name, **mla_args)
 
         unsupported_features = [
             alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
index 8187e457d9e616ce7bf298ccb2fc272145231305..0f956ba88b9c1769533e79d9481df1c851f4b394 100644
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Any, Optional
@@ -112,6 +113,7 @@ class PallasAttentionBackendImpl(AttentionImpl):
         blocksparse_params: Optional[dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[int] = None,
         use_irope: bool = False,
     ) -> None:
         if use_irope:
@@ -127,6 +129,7 @@ class PallasAttentionBackendImpl(AttentionImpl):
         self.num_kv_heads = num_kv_heads
         self.sliding_window = sliding_window
         self.logits_soft_cap = logits_soft_cap
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
 
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
@@ -180,7 +183,9 @@ class PallasAttentionBackendImpl(AttentionImpl):
         num_tokens, hidden_size = query.shape
         query = query.view(num_tokens, self.num_heads, self.head_size)
 
-        if kv_cache.numel() > 0:
+        if self.kv_sharing_target_layer_name is None and kv_cache.numel() > 0:
+            # Write input keys and values to the KV cache.
+            # Skip this if sharing KV cache with an earlier attention layer.
             slot_mapping = attn_metadata.slot_mapping
             write_to_kv_cache(key, value, kv_cache, slot_mapping)
 
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 4000f93984d39ffe92e79b10bec57d2de40d08e2..5db592b1501073e9549d068abe9f9225e87ae8af 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -1,10 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with PagedAttention and Triton prefix prefill."""
 from typing import TYPE_CHECKING, Any, Optional
 
 import torch
 
 from vllm import _custom_ops as ops
+from vllm import envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType)
 from vllm.attention.ops.chunked_prefill_paged_decode import (
@@ -86,6 +88,7 @@ class TritonAttentionImpl(AttentionImpl):
         blocksparse_params: Optional[dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[int] = None,
         use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
@@ -107,6 +110,7 @@ class TritonAttentionImpl(AttentionImpl):
             # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
             logits_soft_cap = 0
         self.logits_soft_cap = logits_soft_cap
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
 
         self.use_irope = use_irope
 
@@ -126,6 +130,8 @@ class TritonAttentionImpl(AttentionImpl):
                                       "TritonAttentionImpl")
 
         self.fp8_dtype = current_platform.fp8_dtype()
+        self.force_prefill_decode_attn = \
+            envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION
 
     def forward(
         self,
@@ -165,40 +171,40 @@ class TritonAttentionImpl(AttentionImpl):
         # Whenever making a change in this method, please benchmark the
         # performance to make sure it does not introduce any overhead.
 
-        num_queries_per_kv = query.shape[1] // key.shape[1]
-        use_prefill_decode_attn = (num_queries_per_kv &
-                                   (num_queries_per_kv - 1)) != 0
-
+        use_prefill_decode_attn = self.force_prefill_decode_attn
         num_actual_tokens = attn_metadata.num_actual_tokens
 
         if use_prefill_decode_attn:
             key_cache, value_cache = PagedAttention.split_kv_cache(
                 kv_cache, self.num_kv_heads, self.head_size)
-
-            # Reshape the input keys and values and store them in the cache.
-            PagedAttention.write_to_paged_cache(
-                key,
-                value,
-                key_cache,
-                value_cache,
-                attn_metadata.slot_mapping,
-                self.kv_cache_dtype,
-                layer._k_scale,
-                layer._v_scale,
-            )
-
         else:
             key_cache, value_cache = kv_cache.unbind(0)
-            torch.ops._C_cache_ops.reshape_and_cache_flash(
-                key,
-                value,
-                key_cache,
-                value_cache,
-                attn_metadata.slot_mapping,
-                self.kv_cache_dtype,
-                layer._k_scale,
-                layer._v_scale,
-            )
+
+        if self.kv_sharing_target_layer_name is None:
+            # Reshape the input keys and values and store them in the cache.
+            # Skip this if sharing KV cache with an earlier attention layer.
+            if use_prefill_decode_attn:
+                PagedAttention.write_to_paged_cache(
+                    key,
+                    value,
+                    key_cache,
+                    value_cache,
+                    attn_metadata.slot_mapping,
+                    self.kv_cache_dtype,
+                    layer._k_scale,
+                    layer._v_scale,
+                )
+            else:
+                torch.ops._C_cache_ops.reshape_and_cache_flash(
+                    key,
+                    value,
+                    key_cache,
+                    value_cache,
+                    attn_metadata.slot_mapping,
+                    self.kv_cache_dtype,
+                    layer._k_scale,
+                    layer._v_scale,
+                )
 
         if self.kv_cache_dtype.startswith("fp8"):
             key_cache = key_cache.view(self.fp8_dtype)
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 10a771e830b68367851bde0b53d38207ce6fe66a..72c7643539273594429f898c7d3f0c3aa317379a 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
 import torch
@@ -16,3 +17,36 @@ class CommonAttentionMetadata:
     seq_lens: torch.Tensor
     """(batch_size,), the length of each request including both computed tokens
     and newly scheduled tokens"""
+
+
+def validate_kv_sharing_target(current_layer_name, target_layer_name,
+                               static_forward_context):
+    error_msg = (f"Specified KV sharing target layer for {current_layer_name} "
+                 f"is not valid: target layer {target_layer_name} ")
+
+    if current_layer_name == target_layer_name:
+        raise ValueError(error_msg +
+                         "cannot be the same as the current layer.")
+
+    if target_layer_name not in static_forward_context:
+        from vllm.model_executor.models.utils import extract_layer_index
+
+        # If target layer name is not in the static fwd context, it means either
+        # a) the target layer does not come BEFORE the current layer, or
+        # b) the target layer is not an Attention layer that exists in the model
+        current_layer_idx = extract_layer_index(current_layer_name)
+        target_layer_idx = extract_layer_index(target_layer_name)
+        if current_layer_idx <= target_layer_idx:
+            raise ValueError(error_msg + "must come before the current layer.")
+        else:
+            raise ValueError(error_msg +
+                             "is not a valid Attention layer in the model.")
+
+    # Currently KV sharing is only supported between layers of the same type
+    target_layer_attn_type = static_forward_context[
+        target_layer_name].attn_type
+    expected = static_forward_context[current_layer_name].attn_type
+    if target_layer_attn_type != expected:
+        raise ValueError(
+            error_msg +
+            f"must be the same type as the current layer ({expected}).")
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index f2ed183b68fc87c0f3fcbe7256501494b9b3235b..d21f94727cf619e77feb46d975b58df05f7d8e03 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections import defaultdict
 from collections.abc import Iterable
 from typing import Callable, Optional
@@ -6,8 +7,8 @@ from typing import Callable, Optional
 from vllm.distributed.kv_events import (AllBlocksCleared, BlockRemoved,
                                         BlockStored, KVCacheEvent)
 from vllm.logger import init_logger
-from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
-                                         KVCacheBlock,
+from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
+                                         FreeKVCacheBlockQueue, KVCacheBlock,
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens)
 from vllm.v1.request import Request
@@ -26,6 +27,7 @@ class BlockPool:
     Args:
         num_gpu_blocks: The number of blocks in the pool.
         enable_caching: Whether to enable prefix caching.
+        enable_kv_cache_events: Whether to enable kv cache events.
     """
 
     def __init__(
@@ -55,42 +57,51 @@ class BlockPool:
         # if there is already an identical block in the cache. This is because
         # we want to make sure the allocated block IDs won't change so that
         # block tables are append-only.
-        self.cached_block_hash_to_block: dict[BlockHashType, dict[
+        self.cached_block_hash_to_block: dict[BlockHashWithGroupId, dict[
             int, KVCacheBlock]] = defaultdict(dict)
 
         # To represent a placeholder block with block_id=0.
         # The ref_cnt of null_block is not maintained, needs special care to
         # avoid freeing it.
         self.null_block = self.free_block_queue.popleft()
+        self.null_block.is_null = True
 
         self.enable_kv_cache_events = enable_kv_cache_events
         self.kv_event_queue: list[KVCacheEvent] = []
 
-    def get_cached_block(self,
-                         block_hash: BlockHashType) -> Optional[KVCacheBlock]:
-        """Get a cached block by the block hash, or None if cache miss.
+    def get_cached_block(
+            self, block_hash: BlockHash,
+            kv_cache_group_ids: list[int]) -> Optional[list[KVCacheBlock]]:
+        """Get the cached block by the block hash for each group in 
+        `kv_cache_group_ids`, or None if cache miss for any group.
         If there are duplicated blocks, we return the first block in the cache.
 
         Args:
             block_hash: The hash value of the block.
+            kv_cache_group_ids: The ids of the KV cache groups.
 
         Returns:
-            The cached block if it exists, or None.
+            The cached blocks if exists, or None.
         """
-        cached_blocks = self.cached_block_hash_to_block.get(block_hash)
-        if not cached_blocks:
-            return None
-        first_block_id = next(iter(cached_blocks))
-        return cached_blocks[first_block_id]
+        cached_blocks = []
+        for group_id in kv_cache_group_ids:
+            cached_blocks_one_group = self.cached_block_hash_to_block.get(
+                BlockHashWithGroupId(block_hash, group_id))
+            if not cached_blocks_one_group:
+                return None
+            first_block = next(iter(cached_blocks_one_group.values()))
+            cached_blocks.append(first_block)
+        return cached_blocks
 
     def cache_full_blocks(
         self,
         request: Request,
         blocks: list[KVCacheBlock],
-        block_hashes: list[BlockHashType],
+        block_hashes: list[BlockHash],
         num_cached_blocks: int,
         num_full_blocks: int,
         block_size: int,
+        kv_cache_group_id: int,
         hash_fn: Callable,
     ) -> None:
         """Cache a list of full blocks for prefix caching.
@@ -110,6 +121,7 @@ class BlockPool:
             num_full_blocks: The number of blocks that are full and should
                 be cached after this function.
             block_size: Number of tokens in each block.
+            kv_cache_group_id: The id of the KV cache group.
             hash_fn: The hash function to use for block hashes.
         """
         if num_cached_blocks == num_full_blocks:
@@ -124,7 +136,7 @@ class BlockPool:
         else:
             prev_block = blocks[num_cached_blocks - 1]
             assert prev_block.block_hash is not None
-            prev_block_hash_value = prev_block.block_hash.hash_value
+            prev_block_hash_value = prev_block.block_hash.get_hash_value()
 
         parent_block_hash = prev_block_hash_value
         new_hashes: Optional[list[int]] = ([] if self.enable_kv_cache_events
@@ -136,8 +148,9 @@ class BlockPool:
                 # The block hash may already be computed in
                 # "get_computed_blocks" if the tokens are not generated by
                 # this request (either the prompt tokens or the previously
-                # generated tokens with preemption). In this case we simply
-                # reuse the block hash.
+                # generated tokens with preemption), or by other
+                # single_type_managers with the same block_size.
+                # In this case we simply reuse the block hash.
                 block_hash = new_block_hashes[i]
             else:
                 # Otherwise compute the block hash and cache it in the request
@@ -164,8 +177,11 @@ class BlockPool:
                 block_hashes.append(block_hash)
 
             # Update and added the full block to the cache.
-            blk.block_hash = block_hash
-            self.cached_block_hash_to_block[block_hash][blk.block_id] = blk
+            block_hash_with_group_id = BlockHashWithGroupId(
+                block_hash, kv_cache_group_id)
+            blk.block_hash = block_hash_with_group_id
+            self.cached_block_hash_to_block[block_hash_with_group_id][
+                blk.block_id] = blk
             if new_hashes is not None:
                 new_hashes.append(block_hash.hash_value)
             prev_block_hash_value = block_hash.hash_value
@@ -235,12 +251,16 @@ class BlockPool:
                 del self.cached_block_hash_to_block[block_hash]
 
             if self.enable_kv_cache_events:
+                # FIXME (Chen): Not sure whether we should return `hash_value`
+                # or `(hash_value, group_id)` here. But it's fine now because
+                # we disable hybrid kv cache manager when kv cache event is
+                # enabled, so there is only one group.
                 self.kv_event_queue.append(
-                    BlockRemoved(block_hashes=[block_hash.hash_value]))
+                    BlockRemoved(block_hashes=[block_hash.get_hash_value()]))
             return True
         return False
 
-    def touch(self, blocks: list[KVCacheBlock]) -> None:
+    def touch(self, blocks: tuple[list[KVCacheBlock], ...]) -> None:
         """Touch a block increases its reference count by 1, and may remove
         the block from the free queue. This is used when a block is hit by
         another request with the same prefix.
@@ -248,12 +268,13 @@ class BlockPool:
         Args:
             blocks: A list of blocks to touch.
         """
-        for block in blocks:
-            # ref_cnt=0 means this block is in the free list (i.e. eviction
-            # candidate), so remove it.
-            if block.ref_cnt == 0 and block != self.null_block:
-                self.free_block_queue.remove(block)
-            block.incr_ref()
+        for blocks_per_group in blocks:
+            for block in blocks_per_group:
+                # ref_cnt=0 means this block is in the free list (i.e. eviction
+                # candidate), so remove it.
+                if block.ref_cnt == 0 and not block.is_null:
+                    self.free_block_queue.remove(block)
+                block.incr_ref()
 
     def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None:
         """Free a list of blocks. The blocks should be ordered by their
@@ -266,7 +287,7 @@ class BlockPool:
         for block in ordered_blocks:
             block.decr_ref()
             # null_block should not be added to the free list.
-            if block.ref_cnt == 0 and block != self.null_block:
+            if block.ref_cnt == 0 and not block.is_null:
                 self.free_block_queue.append(block)
 
     def reset_prefix_cache(self) -> bool:
@@ -278,7 +299,7 @@ class BlockPool:
             bool: True if the prefix cache is successfully reset,
             False otherwise.
         """
-        num_used_blocks = (self.num_gpu_blocks - self.get_num_free_blocks())
+        num_used_blocks = self.num_gpu_blocks - self.get_num_free_blocks()
         if num_used_blocks != 1:  # The null block is always marked as used
             logger.warning(
                 "Failed to reset prefix cache because some "
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index 05d70bb9b97734eb198c9a2357fdb9441d828106..16dc67b9b6f6af30f44258562ca786d81c77d449 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import TYPE_CHECKING
 
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
new file mode 100644
index 0000000000000000000000000000000000000000..5620d9bee7a3b821fc65c148ca8434f78293ffa1
--- /dev/null
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -0,0 +1,363 @@
+# SPDX-License-Identifier: Apache-2.0
+from abc import ABC, abstractmethod
+from typing import Callable, Optional
+
+from vllm.v1.core.block_pool import BlockPool
+from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
+from vllm.v1.core.single_type_kv_cache_manager import (
+    FullAttentionManager, get_manager_for_kv_cache_spec)
+from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig
+from vllm.v1.request import Request
+
+
+class KVCacheCoordinator(ABC):
+    """
+    Coordinate the KV cache of different KV cache groups.
+    """
+
+    def __init__(
+        self,
+        kv_cache_config: KVCacheConfig,
+        max_model_len: int,
+        use_eagle: bool,
+        enable_caching: bool,
+        caching_hash_fn: Callable,
+        enable_kv_cache_events: bool,
+    ):
+        self.kv_cache_config = kv_cache_config
+        self.max_model_len = max_model_len
+
+        self.block_pool = BlockPool(kv_cache_config.num_blocks, enable_caching,
+                                    enable_kv_cache_events)
+
+        # Needs special handling for find_longest_cache_hit if eagle is enabled
+        self.use_eagle = use_eagle
+        self.single_type_managers = tuple(
+            get_manager_for_kv_cache_spec(
+                kv_cache_spec=kv_cache_group.kv_cache_spec,
+                block_pool=self.block_pool,
+                kv_cache_group_id=i,
+                caching_hash_fn=caching_hash_fn,
+            ) for i, kv_cache_group in enumerate(
+                self.kv_cache_config.kv_cache_groups))
+
+    def get_num_blocks_to_allocate(
+            self, request_id: str, num_tokens: int,
+            new_computed_blocks: tuple[list[KVCacheBlock], ...]) -> int:
+        """
+        Get the number of blocks needed to be allocated for the request.
+
+        Args:
+            request_id: The request ID.
+            num_tokens: The total number of tokens that need a slot (including 
+                tokens that are already allocated).
+            new_computed_blocks: The new computed blocks just hitting the
+                prefix caching.
+
+        Returns:
+            The number of blocks.
+        """
+        num_blocks_to_allocate = 0
+        for i, manager in enumerate(self.single_type_managers):
+            num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
+                request_id, num_tokens, new_computed_blocks[i])
+        return num_blocks_to_allocate
+
+    def save_new_computed_blocks(
+            self, request_id: str,
+            new_computed_blocks: tuple[list[KVCacheBlock], ...]) -> None:
+        """
+        Add the new computed blocks to the request.
+
+        Args:
+            request_id: The request ID.
+            new_computed_blocks: The new computed blocks just hitting the
+                prefix cache.
+        """
+        for i, manager in enumerate(self.single_type_managers):
+            manager.save_new_computed_blocks(request_id,
+                                             new_computed_blocks[i])
+
+    def allocate_new_blocks(self, request_id: str,
+                            num_tokens: int) -> tuple[list[KVCacheBlock], ...]:
+        """
+        Allocate new blocks for the request to give it at least `num_tokens` 
+        token slots.
+
+        Args:
+            request_id: The request ID.
+            num_tokens: The total number of tokens that need a slot (including 
+                tokens that are already allocated).
+
+        Returns:
+            The new allocated blocks.
+        """
+        return tuple(
+            manager.allocate_new_blocks(request_id, num_tokens)
+            for manager in self.single_type_managers)
+
+    def cache_blocks(self, request: Request, block_hashes: list[BlockHash],
+                     num_computed_tokens: int) -> None:
+        """
+        Cache the blocks for the request.
+
+        Args:
+            request: The request.
+            block_hashes: The block hashes of the request.
+            num_tokens: The total number of tokens that need to be cached 
+                (including tokens that are already cached).
+        """
+        for manager in self.single_type_managers:
+            manager.cache_blocks(request, block_hashes, num_computed_tokens)
+
+    def free(self, request_id: str) -> None:
+        """
+        Free the blocks for the request.
+
+        Args:
+            request_id: The request ID.
+        """
+        for manager in self.single_type_managers:
+            manager.free(request_id)
+
+    def get_num_common_prefix_blocks(self, request_id: str,
+                                     num_running_requests: int) -> list[int]:
+        """
+        Get the number of common prefix blocks for a request.
+
+        Args:
+            request_id: The request ID.
+            block_hashes: The block hashes of the request.
+
+        Returns:
+            The number of common prefix blocks.
+        """
+        num_blocks_per_group = [
+            manager.get_num_common_prefix_blocks(request_id,
+                                                 num_running_requests)
+            for manager in self.single_type_managers
+        ]
+        return num_blocks_per_group
+
+    def remove_skipped_blocks(self, request_id: str,
+                              num_computed_tokens: int) -> None:
+        """
+        Remove the blocks that are no longer needed from `blocks` and replace 
+        the removed blocks with null_block.
+
+        Args:
+            request_id: The request ID.
+            num_computed_tokens: The number of tokens that have been computed.
+        """
+        for manager in self.single_type_managers:
+            manager.remove_skipped_blocks(request_id, num_computed_tokens)
+
+    def get_blocks(self, request_id: str) -> tuple[list[KVCacheBlock], ...]:
+        """
+        Get the blocks for the request.
+        """
+        return tuple(
+            manager.req_to_blocks.get(request_id) or []
+            for manager in self.single_type_managers)
+
+    @abstractmethod
+    def find_longest_cache_hit(
+        self,
+        block_hashes: list[BlockHash],
+        max_cache_hit_length: int,
+    ) -> tuple[tuple[list[KVCacheBlock], ...], int]:
+        pass
+
+
+class UnitaryKVCacheCoordinator(KVCacheCoordinator):
+    """
+    KV cache coordinator for models with only one KV cache group. This is the
+    case for models with only one KV cache type, e.g., all attention layers use
+    full attention or all attention layers use sliding window attention.
+    """
+
+    def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
+                 use_eagle: bool, enable_caching: bool,
+                 caching_hash_fn: Callable, enable_kv_cache_events: bool):
+        super().__init__(kv_cache_config, max_model_len, use_eagle,
+                         enable_caching, caching_hash_fn,
+                         enable_kv_cache_events)
+        self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[
+            0].kv_cache_spec
+        self.block_size = self.kv_cache_spec.block_size
+        assert len(self.kv_cache_config.kv_cache_groups) == 1, (
+            "UnitaryKVCacheCoordinator assumes only one kv cache group")
+
+    def find_longest_cache_hit(
+        self,
+        block_hashes: list[BlockHash],
+        max_cache_hit_length: int,
+    ) -> tuple[tuple[list[KVCacheBlock], ...], int]:
+        hit_blocks = self.single_type_managers[0].find_longest_cache_hit(
+            block_hashes=block_hashes,
+            max_length=max_cache_hit_length,
+            kv_cache_group_ids=[0],
+            block_pool=self.block_pool,
+            kv_cache_spec=self.kv_cache_spec,
+            use_eagle=self.use_eagle,
+        )
+        return hit_blocks, len(hit_blocks[0]) * self.block_size
+
+
+class HybridKVCacheCoordinator(KVCacheCoordinator):
+    """
+    KV cache coordinator for hybrid models with multiple KV cache types, and
+    thus multiple kv cache groups.
+    To simplify `find_longest_cache_hit`, it only supports the combination of 
+    two types of KV cache groups, and one of them must be full attention.
+    May extend to more general cases in the future.
+    """
+
+    def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
+                 use_eagle: bool, enable_caching: bool,
+                 caching_hash_fn: Callable, enable_kv_cache_events: bool):
+        super().__init__(kv_cache_config, max_model_len, use_eagle,
+                         enable_caching, caching_hash_fn,
+                         enable_kv_cache_events)
+        self.verify_and_split_kv_cache_groups()
+
+    def verify_and_split_kv_cache_groups(self) -> None:
+        """
+        Verifies that the model has exactly two types of KV cache groups, and 
+        one of them is full attention. Then, split the kv cache groups into full
+        attention groups and other groups.
+        """
+        full_attention_type_id: Optional[str] = None
+        other_type_id: Optional[str] = None
+        self.full_attention_group_ids: list[int] = []
+        self.other_group_ids: list[int] = []
+        for i, g in enumerate(self.kv_cache_config.kv_cache_groups):
+            if isinstance(g.kv_cache_spec, FullAttentionSpec):
+                if full_attention_type_id is None:
+                    full_attention_type_id = g.kv_cache_spec.type_id
+                else:
+                    assert full_attention_type_id == g.kv_cache_spec.type_id, (
+                        "HybridKVCacheCoordinator assumes exactly one type of "
+                        "full attention groups now.")
+                self.full_attention_group_ids.append(i)
+            else:
+                if other_type_id is None:
+                    other_type_id = g.kv_cache_spec.type_id
+                else:
+                    assert other_type_id == g.kv_cache_spec.type_id, (
+                        "HybridKVCacheCoordinator assumes "
+                        "exactly one other type of groups now.")
+                self.other_group_ids.append(i)
+
+        assert full_attention_type_id is not None, (
+            "HybridKVCacheCoordinator assumes exactly one type of full "
+            "attention groups now.")
+        assert other_type_id is not None, (
+            "HybridKVCacheCoordinator assumes exactly one type of other "
+            "groups now.")
+
+        self.full_attention_manager_cls = FullAttentionManager
+        self.other_attention_cls = self.single_type_managers[
+            self.other_group_ids[0]].__class__
+
+        self.full_attention_spec = self.kv_cache_config.kv_cache_groups[
+            self.full_attention_group_ids[0]].kv_cache_spec
+        self.other_spec = self.kv_cache_config.kv_cache_groups[
+            self.other_group_ids[0]].kv_cache_spec
+
+        self.full_attention_block_size = self.full_attention_spec.block_size
+        self.other_block_size = self.other_spec.block_size
+        assert self.other_block_size % self.full_attention_block_size == 0, (
+            "KVCacheCoordinator assumes the block_size of full attention "
+            "layers is divisible by other layers now.")
+
+        if max(self.full_attention_group_ids) < min(self.other_group_ids):
+            self.full_attn_first = True
+        elif max(self.other_group_ids) < min(self.full_attention_group_ids):
+            self.full_attn_first = False
+        else:
+            raise ValueError(
+                "HybridKVCacheCoordinator assumes the full "
+                "attention group ids and other attention group ids "
+                "do not interleave, either full attention group ids "
+                "are before other attention group ids or vice versa."
+                "This is for simplifying merging hit_blocks_full_attn and "
+                "hit_blocks_other_attn to hit_blocks.")
+
+    def find_longest_cache_hit(
+        self,
+        block_hashes: list[BlockHash],
+        max_cache_hit_length: int,
+    ) -> tuple[tuple[list[KVCacheBlock], ...], int]:
+        """
+        Find the longest cache hit for the request.
+
+        Args:
+            block_hashes: The block hashes of the request.
+            max_cache_hit_length: The maximum length of the cache hit.
+
+        Returns:
+            A tuple containing:
+                - A list of the cache hit blocks for each single type manager.
+                - The number of tokens of the longest cache hit.
+        """
+        # First, find the longest cache hit for full attention.
+        hit_blocks_full_attn = (
+            self.full_attention_manager_cls.find_longest_cache_hit(
+                block_hashes=block_hashes,
+                max_length=max_cache_hit_length,
+                kv_cache_group_ids=self.full_attention_group_ids,
+                block_pool=self.block_pool,
+                kv_cache_spec=self.full_attention_spec,
+                use_eagle=self.use_eagle,
+            ))
+        hit_length = len(
+            hit_blocks_full_attn[0]) * self.full_attention_block_size
+
+        # Next, find the cache hit for the other attention WITHIN
+        # the cache hit of full attention.
+        hit_blocks_other_attn = (
+            self.other_attention_cls.find_longest_cache_hit(
+                block_hashes=block_hashes,
+                max_length=hit_length,
+                kv_cache_group_ids=self.other_group_ids,
+                block_pool=self.block_pool,
+                kv_cache_spec=self.other_spec,
+                use_eagle=self.use_eagle,
+            ))
+        hit_length = len(hit_blocks_other_attn[0]) * self.other_block_size
+
+        # NOTE: the prefix cache hit length must be a multiple of block_size as
+        # we don't support partial block cache hit yet. The cache hit length
+        # of other attention is ensured to be a multiple of the block size of
+        # full attention layers in current implementation, because hit_length is
+        # a multiple of other attention's block size, and other attention's
+        # block size is a multiple of full attention's block size (verified in
+        # `verify_and_split_kv_cache_groups`).
+        assert hit_length % self.full_attention_block_size == 0
+
+        # Truncate the full attention cache hit to the length of the
+        # cache hit of the other attention.
+        for group_hit_blocks in hit_blocks_full_attn:
+            del group_hit_blocks[hit_length // self.full_attention_block_size:]
+
+        # Merge the hit blocks of full attention and other attention.
+        if self.full_attn_first:
+            hit_blocks = hit_blocks_full_attn + hit_blocks_other_attn
+        else:
+            hit_blocks = hit_blocks_other_attn + hit_blocks_full_attn
+        return hit_blocks, hit_length
+
+
+def get_kv_cache_coordinator(
+        kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool,
+        enable_caching: bool, caching_hash_fn: Callable,
+        enable_kv_cache_events: bool) -> KVCacheCoordinator:
+    if len(kv_cache_config.kv_cache_groups) == 1:
+        return UnitaryKVCacheCoordinator(kv_cache_config, max_model_len,
+                                         use_eagle, enable_caching,
+                                         caching_hash_fn,
+                                         enable_kv_cache_events)
+    return HybridKVCacheCoordinator(kv_cache_config, max_model_len, use_eagle,
+                                    enable_caching, caching_hash_fn,
+                                    enable_kv_cache_events)
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 0f6098d2b4005b324205be3caf6be1fc9ce38eb3..2e09f4c0aacfc128995410f94e2bc24c656da585 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections import defaultdict
 from dataclasses import dataclass
@@ -7,11 +8,9 @@ from typing import Optional
 from vllm.distributed.kv_events import KVCacheEvent
 from vllm.logger import init_logger
 from vllm.utils import sha256
-from vllm.v1.core.block_pool import BlockPool
-from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock,
+from vllm.v1.core.kv_cache_coordinator import get_kv_cache_coordinator
+from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock,
                                          hash_request_tokens)
-from vllm.v1.core.single_type_kv_cache_manager import (
-    get_manager_for_kv_cache_spec)
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request, RequestStatus
@@ -21,34 +20,49 @@ logger = init_logger(__name__)
 
 @dataclass
 class KVCacheBlocks:
-    blocks: list[KVCacheBlock]
+    """
+    The allocation result of KVCacheManager, work as the interface between
+    Scheduler and KVCacheManager, to hide KVCacheManager's internal data
+    structure from the Scheduler.
+    """
+    blocks: tuple[list[KVCacheBlock], ...]
+    """
+    blocks[i][j] refers to the i-th kv_cache_group and the j-th block of tokens.
+    We don't use block of tokens as the outer dimension because it assumes all
+    kv_cache_groups have the same number of blocks, which is true for now but 
+    will be broken if we want to give different block_size to different 
+    kv_cache_groups in the future.
+    """
 
     def __add__(self, other: "KVCacheBlocks") -> "KVCacheBlocks":
         """Adds two KVCacheBlocks instances."""
-        return KVCacheBlocks(self.blocks + other.blocks)
+        return KVCacheBlocks(
+            tuple(blk1 + blk2
+                  for blk1, blk2 in zip(self.blocks, other.blocks)))
 
-    @classmethod
-    def create_empty(cls) -> "KVCacheBlocks":
-        """Creates a new KVCacheBlocks instance with no blocks."""
-        return cls([])
-
-    def get_block_ids(self) -> list[list[int]]:
+    def get_block_ids(self) -> tuple[list[int], ...]:
         """
         Converts the KVCacheBlocks instance to block_ids.
         
         Returns:
-            list[list[int]]: A two-level list where
-            * the outer list corresponds to KV cache groups (only 1 group now)
+            tuple[list[int], ...]: A tuple of lists where
+            * the outer tuple corresponds to KV cache groups
             * each inner list contains the block_ids of the blocks in that group
         """
-        return [[block.block_id for block in self.blocks]]
+        return tuple([blk.block_id for blk in group] for group in self.blocks)
 
     def get_unhashed_block_ids(self) -> list[int]:
         """Get block_ids of unhashed blocks from KVCacheBlocks instance."""
+        assert len(self.blocks) == 1, "Only one group is supported"
         return [
-            block.block_id for block in self.blocks if block.block_hash is None
+            block.block_id for block in self.blocks[0]
+            if block.block_hash is None
         ]
 
+    def new_empty(self) -> "KVCacheBlocks":
+        """Creates a new KVCacheBlocks instance with no blocks."""
+        return KVCacheBlocks(tuple([] for _ in range(len(self.blocks))))
+
 
 class KVCacheManager:
 
@@ -62,12 +76,6 @@ class KVCacheManager:
         log_stats: bool = False,
         enable_kv_cache_events: bool = False,
     ) -> None:
-        assert len(kv_cache_config.kv_cache_groups) == 1, (
-            "KVCacheManager does not support hybrid models with more than 1 "
-            "kv cache group")
-        kv_cache_spec = kv_cache_config.kv_cache_groups[0].kv_cache_spec
-        self.block_size = kv_cache_spec.block_size
-        self.num_gpu_blocks = kv_cache_config.num_blocks
         self.max_model_len = max_model_len
 
         self.enable_caching = enable_caching
@@ -76,23 +84,30 @@ class KVCacheManager:
         self.log_stats = log_stats
         # FIXME: make prefix cache stats conditional on log_stats
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
-
-        self.block_pool = BlockPool(self.num_gpu_blocks, enable_caching,
-                                    enable_kv_cache_events)
-
-        self.single_type_manager = get_manager_for_kv_cache_spec(
-            kv_cache_spec=kv_cache_spec,
-            block_pool=self.block_pool,
+        assert len(
+            set(g.kv_cache_spec.block_size
+                for g in kv_cache_config.kv_cache_groups)
+        ) == 1, "Only one block size is supported for now"
+        self.block_size = kv_cache_config.kv_cache_groups[
+            0].kv_cache_spec.block_size
+
+        self.coordinator = get_kv_cache_coordinator(
+            kv_cache_config=kv_cache_config,
+            max_model_len=self.max_model_len,
             use_eagle=self.use_eagle,
-            num_kv_cache_groups=1,
+            enable_caching=enable_caching,
             caching_hash_fn=self.caching_hash_fn,
+            enable_kv_cache_events=enable_kv_cache_events,
         )
+        self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups)
+        self.block_pool = self.coordinator.block_pool
+        self.kv_cache_config = kv_cache_config
 
         # Mapping from request ID to kv block hashes.
         # This is to avoid recomputing the block hashes for each call of
         # `get_computed_blocks` or `allocate_slots`.
         self.req_to_block_hashes: defaultdict[
-            str, list[BlockHashType]] = defaultdict(list)
+            str, list[BlockHash]] = defaultdict(list)
 
     @property
     def usage(self) -> float:
@@ -132,7 +147,7 @@ class KVCacheManager:
         # When the request requires prompt logprobs, we skip prefix caching.
         if (not self.enable_caching
                 or request.sampling_params.prompt_logprobs is not None):
-            return KVCacheBlocks.create_empty(), 0
+            return self.create_empty_block_list(), 0
 
         # The block hashes for the request may already be computed
         # if the scheduler has tried to schedule the request before.
@@ -153,20 +168,16 @@ class KVCacheManager:
         # num_computed_tokens to be block-size aligned. Removing this limitation
         # could slightly improve performance in the future.
         max_cache_hit_length = request.num_tokens - 1
-
-        computed_blocks = self.single_type_manager.find_longest_cache_hit(
-            block_hashes, max_cache_hit_length)
-        # NOTE(woosuk): Since incomplete blocks are not eligible for
-        # sharing, `num_computed_tokens` is always a multiple of
-        # `block_size`.
-        num_computed_tokens = len(computed_blocks) * self.block_size
+        computed_blocks, num_new_computed_tokens = (
+            self.coordinator.find_longest_cache_hit(block_hashes,
+                                                    max_cache_hit_length))
 
         if self.log_stats:
             assert self.prefix_cache_stats is not None
             self.prefix_cache_stats.queries += request.num_tokens
-            self.prefix_cache_stats.hits += num_computed_tokens
+            self.prefix_cache_stats.hits += num_new_computed_tokens
 
-        return KVCacheBlocks(computed_blocks), num_computed_tokens
+        return KVCacheBlocks(computed_blocks), num_new_computed_tokens
 
     def allocate_slots(
         self,
@@ -219,7 +230,8 @@ class KVCacheManager:
         if new_computed_blocks is not None:
             new_computed_block_list = new_computed_blocks.blocks
         else:
-            new_computed_block_list = []
+            new_computed_block_list = tuple(
+                [] for _ in range(len(self.kv_cache_config.kv_cache_groups)))
 
         # Free the blocks that are skipped during the attention computation
         # (e.g., tokens outside the sliding window).
@@ -227,8 +239,8 @@ class KVCacheManager:
         # insufficient free blocks.
         # Should call this function before allocating new blocks to reduce
         # the number of evicted blocks.
-        self.single_type_manager.remove_skipped_blocks(
-            request.request_id, request.num_computed_tokens)
+        self.coordinator.remove_skipped_blocks(request.request_id,
+                                               request.num_computed_tokens)
 
         # The number of computed tokens is the number of computed tokens plus
         # the new prefix caching hits
@@ -237,12 +249,12 @@ class KVCacheManager:
         num_tokens_need_slot = min(
             num_computed_tokens + num_new_tokens + num_lookahead_tokens,
             self.max_model_len)
-        num_blocks_to_allocate = (
-            self.single_type_manager.get_num_blocks_to_allocate(
-                request_id=request.request_id,
-                num_tokens=num_tokens_need_slot,
-                new_computed_blocks=new_computed_block_list,
-            ))
+
+        num_blocks_to_allocate = self.coordinator.get_num_blocks_to_allocate(
+            request_id=request.request_id,
+            num_tokens=num_tokens_need_slot,
+            new_computed_blocks=new_computed_block_list,
+        )
 
         if num_blocks_to_allocate > self.block_pool.get_num_free_blocks():
             # Cannot allocate new blocks
@@ -252,16 +264,16 @@ class KVCacheManager:
         if self.enable_caching:
             self.block_pool.touch(new_computed_block_list)
         else:
-            assert not new_computed_block_list, (
+            assert not any(new_computed_block_list), (
                 "Computed blocks should be empty when "
                 "prefix caching is disabled")
 
         # Append the new computed blocks to the request blocks until now to
         # avoid the case where the new blocks cannot be allocated.
-        self.single_type_manager.save_new_computed_blocks(
-            request.request_id, new_computed_block_list)
+        self.coordinator.save_new_computed_blocks(request.request_id,
+                                                  new_computed_block_list)
 
-        new_blocks = self.single_type_manager.allocate_new_blocks(
+        new_blocks = self.coordinator.allocate_new_blocks(
             request.request_id, num_tokens_need_slot)
 
         # P/D: delay caching blocks if we have to recv from
@@ -272,7 +284,7 @@ class KVCacheManager:
         # Speculated tokens might be rejected in the future, so we does
         # not cache any speculated tokens. We only cache blocks with
         # generated (accepted) tokens.
-        self.single_type_manager.cache_blocks(
+        self.coordinator.cache_blocks(
             request, self.req_to_block_hashes[request.request_id],
             num_computed_tokens + num_new_tokens - num_draft_tokens)
 
@@ -286,7 +298,7 @@ class KVCacheManager:
         Args:
             request: The request to free the blocks.
         """
-        self.single_type_manager.free(request.request_id)
+        self.coordinator.free(request.request_id)
 
     def reset_prefix_cache(self) -> bool:
         """Reset prefix cache. This function may be used in RLHF
@@ -344,10 +356,8 @@ class KVCacheManager:
             group.
         """
         assert request.status == RequestStatus.RUNNING
-        return [
-            self.single_type_manager.get_num_common_prefix_blocks(
-                request.request_id, num_running_requests)
-        ]
+        return self.coordinator.get_num_common_prefix_blocks(
+            request.request_id, num_running_requests)
 
     def free_block_hashes(self, request: Request) -> None:
         """Discard the block hashes for the request.
@@ -365,8 +375,18 @@ class KVCacheManager:
         """
         return self.block_pool.take_events()
 
-    def get_block_ids(self, request_id: str) -> list[list[int]]:
+    def get_block_ids(self, request_id: str) -> tuple[list[int], ...]:
         """Get the block ids of a request."""
-        assert request_id in self.single_type_manager.req_to_blocks
-        return KVCacheBlocks(self.single_type_manager.req_to_blocks[request_id]
-                             ).get_block_ids()
+        return KVCacheBlocks(
+            self.coordinator.get_blocks(request_id)).get_block_ids()
+
+    def cache_blocks(self, request: Request, num_computed_tokens: int) -> None:
+        """Cache the blocks for the request."""
+        block_hashes = self.req_to_block_hashes[request.request_id]
+        self.coordinator.cache_blocks(request, block_hashes,
+                                      num_computed_tokens)
+
+    def create_empty_block_list(self) -> KVCacheBlocks:
+        """Creates a new KVCacheBlocks instance with no blocks."""
+        return KVCacheBlocks(tuple([]
+                                   for _ in range(self.num_kv_cache_groups)))
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 403b5401be75a7b0503a40750f0339d4589f0614..6d4bcfe64a35791c970f88d2267fb9c9457a5ff4 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1,14 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """KV-Cache Utilities."""
+
 import os
-from collections import deque
-from collections.abc import Sequence
+from collections import defaultdict, deque
+from collections.abc import Iterable, Sequence
 from dataclasses import dataclass
 from typing import Any, Callable, NamedTuple, Optional
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import GiB_bytes, sha256
+from vllm.utils import GiB_bytes, cdiv, sha256
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, KVCacheSpec,
                                         KVCacheTensor, SlidingWindowSpec)
@@ -18,7 +20,7 @@ from vllm.v1.request import Request
 logger = init_logger(__name__)
 
 
-class BlockHashType(NamedTuple):
+class BlockHash(NamedTuple):
     """Hash value of a block (int), the token IDs in the block, and extra keys.
     We keep a tuple of token IDs and extra keys to reduce the likelihood of
     hash collisions when the hash value is the same. By using SHA256 however,
@@ -32,6 +34,18 @@ class BlockHashType(NamedTuple):
     extra_keys: Optional[Any] = None
 
 
+class BlockHashWithGroupId(NamedTuple):
+    # The hash value for the contents (e.g., token_ids) of a block without group
+    # ID. The value is the same for blocks representing the same tokens but for
+    # different groups.
+    block_hash: BlockHash
+    # The KV cache group ID.
+    group_id: int
+
+    def get_hash_value(self) -> int:
+        return self.block_hash.hash_value
+
+
 # The hash seed for the first block of the prefix block sequence.
 #
 # Even if the hash function is the builtin hash(), we use sha256 to generate
@@ -43,7 +57,7 @@ class BlockHashType(NamedTuple):
 # This aligns with the behavior of Python's hash() function, which also uses
 # a random seed if PYTHONHASHSEED is not set.
 NONE_HASH = int.from_bytes(os.urandom(32), byteorder="big") if os.getenv(
-    'PYTHONHASHSEED') is None else sha256(os.getenv('PYTHONHASHSEED'))
+    "PYTHONHASHSEED") is None else sha256(os.getenv("PYTHONHASHSEED"))
 
 
 class PrefixCachingMetrics:
@@ -117,13 +131,16 @@ class KVCacheBlock:
     ref_cnt: int = 0
     # The hash of the block composed of (block hash, tuple of token IDs).
     # It is only available when the block is full.
-    _block_hash: Optional[BlockHashType] = None
+    _block_hash: Optional[BlockHashWithGroupId] = None
 
     # Used to construct a doubly linked list for free blocks.
     # These two attributes should only be manipulated by FreeKVCacheBlockQueue.
     prev_free_block: Optional["KVCacheBlock"] = None
     next_free_block: Optional["KVCacheBlock"] = None
 
+    # Whether the block is a null block that should never be cached.
+    is_null: bool = False
+
     def incr_ref(self):
         self.ref_cnt += 1
 
@@ -131,11 +148,11 @@ class KVCacheBlock:
         self.ref_cnt -= 1
 
     @property
-    def block_hash(self) -> Optional[BlockHashType]:
+    def block_hash(self) -> Optional[BlockHashWithGroupId]:
         return self._block_hash
 
     @block_hash.setter
-    def block_hash(self, block_hash: BlockHashType):
+    def block_hash(self, block_hash: BlockHashWithGroupId):
         assert self.block_hash is None, (
             "The block already has a hash. This should not happen.")
         self._block_hash = block_hash
@@ -147,10 +164,10 @@ class KVCacheBlock:
     def __repr__(self) -> str:
         # Use block_id instead of KVCacheBlock object to avoid calling __repr__
         # on KVCacheBlock object recursively.
-        prev_block_id = self.prev_free_block.block_id \
-            if self.prev_free_block else None
-        next_block_id = self.next_free_block.block_id \
-            if self.next_free_block else None
+        prev_block_id = (self.prev_free_block.block_id
+                         if self.prev_free_block else None)
+        next_block_id = (self.next_free_block.block_id
+                         if self.next_free_block else None)
         return (f"KVCacheBlock(block_id={self.block_id}, "
                 f"ref_cnt={self.ref_cnt}, "
                 f"_block_hash={self._block_hash}, "
@@ -398,7 +415,7 @@ def hash_block_tokens(
         hash_function: Callable,
         parent_block_hash: Optional[int],
         curr_block_token_ids: Sequence[int],
-        extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHashType:
+        extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHash:
     """Computes a hash value corresponding to the contents of a block and
     the contents of the preceding block(s). The hash value is used for
     prefix caching. We use LRU cache for this function to avoid recomputing
@@ -419,14 +436,14 @@ def hash_block_tokens(
         parent_block_hash = NONE_HASH
 
     curr_block_token_ids_tuple = tuple(curr_block_token_ids)
-    return BlockHashType(
+    return BlockHash(
         hash_function(
             (parent_block_hash, curr_block_token_ids_tuple, extra_keys)),
         curr_block_token_ids_tuple, extra_keys)
 
 
 def hash_request_tokens(hash_function: Any, block_size: int,
-                        request: Request) -> list[BlockHashType]:
+                        request: Request) -> list[BlockHash]:
     """Computes hash values of a chain of blocks given a sequence of
     token IDs. The hash value is used for prefix caching.
 
@@ -464,6 +481,15 @@ def hash_request_tokens(hash_function: Any, block_size: int,
     return ret
 
 
+def max_memory_usage_bytes(vllm_config: VllmConfig,
+                           kv_cache_specs: Iterable[KVCacheSpec]) -> int:
+    """
+    Get the maximum memory usage in bytes for the given KV cache specs.
+    """
+    return sum(
+        spec.max_memory_usage_bytes(vllm_config) for spec in kv_cache_specs)
+
+
 def estimate_max_model_len(vllm_config: VllmConfig,
                            kv_cache_spec: dict[str, KVCacheSpec],
                            available_memory: int) -> int:
@@ -485,11 +511,8 @@ def estimate_max_model_len(vllm_config: VllmConfig,
         # Modify the max_model_len for this calculation
         vllm_config.model_config.max_model_len = model_len
         # Calculate memory needed for the given model length
-        memory_needed = sum(
-            (layer_spec.max_memory_usage_bytes(vllm_config)
-             for layer_spec in kv_cache_spec.values()),
-            start=0,
-        )
+        memory_needed = max_memory_usage_bytes(vllm_config,
+                                               kv_cache_spec.values())
         return memory_needed <= available_memory
 
     # Binary search for the maximum model length
@@ -534,9 +557,7 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
                          "initializing the engine.")
 
     max_model_len = vllm_config.model_config.max_model_len
-    needed_memory = 0
-    for layer_spec in kv_cache_spec.values():
-        needed_memory += layer_spec.max_memory_usage_bytes(vllm_config)
+    needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
 
     if needed_memory > available_memory:
         # Estimate the maximum model length that can fit in the available memory
@@ -544,16 +565,17 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
                                                    available_memory)
         estimated_msg = ""
         if estimated_max_len > 0:
-            estimated_msg = " Based on the available memory,"
-            f" the estimated maximum model length is {estimated_max_len}."
+            estimated_msg = (
+                "Based on the available memory, "
+                f"the estimated maximum model length is {estimated_max_len}.")
 
         raise ValueError(
             f"To serve at least one request with the models's max seq len "
             f"({max_model_len}), ({needed_memory/GiB_bytes:.2f} GiB KV "
             f"cache is needed, which is larger than the available KV cache "
-            f"memory ({available_memory/GiB_bytes:.2f} GiB)."
+            f"memory ({available_memory/GiB_bytes:.2f} GiB). "
             f"{estimated_msg} "
-            f" Try increasing `gpu_memory_utilization` or decreasing "
+            f"Try increasing `gpu_memory_utilization` or decreasing "
             f"`max_model_len` when initializing the engine.")
 
 
@@ -561,20 +583,20 @@ def create_kv_cache_group_specs(
         kv_cache_spec: dict[str, KVCacheSpec],
         grouped_layer_names: list[list[str]]) -> list[KVCacheGroupSpec]:
     """
-     Create KVCacheGroupSpec object for each kv cache group layer.
-     The layers in the same group should share the same
-     KVCacheSpec.
-
-     Args:
-         kv_cache_spec:
-             A mapping from each layer name to its corresponding KVCacheSpec.
-         grouped_layer_names:
-             A list of kv cache groups, where each element is a list of layer
-             names that belong to the same group and should share the same
-             KVCacheSpec.
-     Returns:
-         A list of KVCacheGroupSpec objects, one for each group.
-     """
+    Create KVCacheGroupSpec object for each kv cache group layer.
+    The layers in the same group should share the same
+    KVCacheSpec.
+
+    Args:
+        kv_cache_spec:
+            A mapping from each layer name to its corresponding KVCacheSpec.
+        grouped_layer_names:
+            A list of kv cache groups, where each element is a list of layer
+            names that belong to the same group and should share the same
+            KVCacheSpec.
+    Returns:
+        A list of KVCacheGroupSpec objects, one for each group.
+    """
     kv_cache_groups = []
     for layer_names_one_group in grouped_layer_names:
         layer_specs = [
@@ -601,6 +623,55 @@ def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
     return len(layer_keys) == 1
 
 
+def get_max_concurrency_for_kv_cache_config(
+        vllm_config: VllmConfig, kv_cache_config: KVCacheConfig) -> float:
+    """
+    Get the maximum concurrency for the given KV cache configuration.
+    """
+    num_layer_per_group = max(
+        len(group.layer_names) for group in kv_cache_config.kv_cache_groups)
+    max_memory_usage_per_request = num_layer_per_group * max_memory_usage_bytes(
+        vllm_config,
+        (group.kv_cache_spec for group in kv_cache_config.kv_cache_groups))
+    memory_per_block = kv_cache_config.kv_cache_groups[
+        0].kv_cache_spec.page_size_bytes * num_layer_per_group
+    num_block_per_request = cdiv(max_memory_usage_per_request,
+                                 memory_per_block)
+    max_concurrency = kv_cache_config.num_blocks / num_block_per_request
+    return max_concurrency
+
+
+def get_num_blocks(vllm_config: VllmConfig, num_layers: int,
+                   available_memory: int, page_size: int) -> int:
+    """
+    Get the number of kv cache blocks.
+
+    Args:
+        vllm_config: The global VllmConfig
+        num_layers: The number of layers
+        available_memory: Memory available for KV cache in bytes.
+        page_size: The page size of the KV cache.
+    """
+    num_blocks = int(available_memory // page_size // num_layers)
+    num_blocks = max(num_blocks, 0)
+    if vllm_config.cache_config.num_gpu_blocks_override is not None:
+        num_gpu_blocks_override = \
+            vllm_config.cache_config.num_gpu_blocks_override
+        logger.info(
+            "Overriding num_gpu_blocks=%d with "
+            "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
+    return num_blocks
+
+
+def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int:
+    """
+    Get the page size of the KV cache.
+    """
+    page_sizes = set(layer.page_size_bytes for layer in kv_cache_spec.values())
+    assert len(page_sizes) == 1
+    return page_sizes.pop()
+
+
 def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
                                       kv_cache_spec: dict[str, KVCacheSpec],
                                       available_memory: int) -> KVCacheConfig:
@@ -617,57 +688,218 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
         The generated KVCacheConfig
     """
 
-    page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
-    assert len(page_sizes) == 1
-    page_size = page_sizes.pop()
+    page_size = get_uniform_page_size(kv_cache_spec)
+    num_blocks = get_num_blocks(vllm_config, len(kv_cache_spec),
+                                available_memory, page_size)
 
-    num_blocks = int(available_memory // page_size // len(kv_cache_spec))
-    num_blocks = max(num_blocks, 0)
+    per_layer_size = page_size * num_blocks
+    # All layers have the same KV cache spec, so we create one kv cache group
+    # for all layers.
+    grouped_layer_names = [list(kv_cache_spec.keys())]
 
-    if vllm_config.cache_config.num_gpu_blocks_override is not None:
-        num_gpu_blocks_override = \
-            vllm_config.cache_config.num_gpu_blocks_override
-        logger.info(
-            "Overriding num_gpu_blocks=%d with "
-            "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
-        num_blocks = num_gpu_blocks_override
+    # Each layer uses a separate Tensor to store its KV cache.
+    kv_cache_tensors = [
+        KVCacheTensor(size=per_layer_size, shared_by=[layer_name])
+        for layer_name in kv_cache_spec
+    ]
+
+    kv_cache_config = KVCacheConfig(
+        num_blocks=num_blocks,
+        kv_cache_tensors=kv_cache_tensors,
+        kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec,
+                                                    grouped_layer_names),
+    )
 
     num_tokens = num_blocks * vllm_config.cache_config.block_size
     num_tokens_str = f"{num_tokens:,}"
     logger.info("GPU KV cache size: %s tokens", num_tokens_str)
     max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
-    max_concurrency = num_tokens / vllm_config.model_config.max_model_len
+    max_concurrency = get_max_concurrency_for_kv_cache_config(
+        vllm_config, kv_cache_config)
     logger.info("Maximum concurrency for %s tokens per request: %.2fx",
                 max_model_len_str, max_concurrency)
+    return kv_cache_config
 
-    per_layer_size = page_size * num_blocks
-    # All layers have the same KV cache spec, so we create one kv cache group
-    # for all layers.
-    grouped_layer_names = [list(kv_cache_spec.keys())]
+
+def is_kv_cache_page_size_uniform(
+        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+    """
+    Whether all layers in the given KVCacheSpec have the same page size.
+    Args:
+        kv_cache_spec: The KVCacheSpec of each attention layer in the model
+
+    Returns:
+        True if all layers have the same page size, False otherwise.
+    """
+
+    page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
+    return len(page_sizes) == 1
+
+
+def _get_kv_cache_config_uniform_page_size(
+        vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
+        available_memory: int) -> KVCacheConfig:
+    """
+    Generates the KV cache configuration for hybrid models with multiple 
+    attention types but still with a uniform page size (physical memory per 
+    block per layer) for all layers.
+
+    Detailed explanation about kv cache management of hybrid models:
+    The layers in the models are repeated with some patterns, e.g., a model
+    with 10 full attention layers and 20 sliding window attention layers can be
+    regarded as repeating the pattern (1 * full, 2 * sw) 10 times. 
+    The KVCacheManager allocates different block tables for each of the 3 layers
+    in the pattern, and repeats each of them 10 times to generate the 
+    block_table for the 30 layers in the model.
+    Therefore, we can group the layers in the model into 3 kv_cache_groups, each
+    of which contains 10 layers in the model.
+    The KVCacheManager allocates the block_table for each group based on its
+    kv_cache spec, and the model runner applies the block table to each layer 
+    in the group.
+    For example:
+    1. A model only uses full attention. The pattern is 
+    (num_hidden_layers * full), so there is only one group and the block table 
+    is shared by all layers. It is already handled by 
+    `_get_kv_cache_config_uniform_type`.
+    2. A model with 10 full attention layers and 20 sliding window 
+    attention layers. There are 3 layers in the pattern (1 * full, 2 * sw), so 
+    there are 3 kv_cache_groups, each of which represents 10 layers.
+
+    To simplify the implementation, we make the following assumptions:
+    1. Physical memory per block: Must be the same across all KV cache groups. 
+    Breaking this assumption is non-trivial due to memory fragmentation concerns
+    when allocating blocks of different sizes.
+    2. Tokens per block (block_size): Currently, we directly use 
+    `CacheConfig.block_size` for all layers. It can be extended to vary by KV 
+    cache group, but within each KV cache group, all layers must share the same 
+    block size.
+    3. Physical memory per token per layer: This property is decided by model 
+    config. Currently we only support models that have the same physical memory 
+    per token per layer for all layers. Can be relaxed with a simple extension, 
+    but still need to keep physical memory per block the same for all groups.
+    4. Number of layers per group: Currently assumed the same for all layers. 
+    Can be relaxed with a simple extension, but still need to keep physical 
+    memory per block the same for all groups.
+    5. Attention type within groups: All layers in a group must share the same
+    attention type. One exception is that, when 
+    `--disable-hybrid-kv-cache-manager` is true, the single group for full 
+    attention layers may also include attention layers using sliding window or 
+    LLaMA 4 local attention. See `unify_hybrid_kv_cache_specs` for more details.
+    6. Support for multiple attention types: The design for most components is 
+    general to an arbitrary number of attention types. But 
+    `find_longest_cache_hit` only supports one attention type or two 
+    types of full-attention plus exactly one another type. The general
+    implementation of this function is feasible but we don't know how to 
+    implement it cleanly yet.
+
+    As we assume tokens per block, physical memory per token per layer, and 
+    number of layers per group are the same now, we can ensure that physical 
+    memory per block is the same for all groups.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_spec: The KVCacheSpec of each attention layer in the model
+        available_memory: Memory available for KV cache in bytes.
+    Returns:
+        The generated KVCacheConfig
+    """
+    # Group all layers by type_id.
+    # E.g., 2 full attention layers and 3 sliding window attention layers,
+    # -> (full.0, full.1), (sw.0, sw.1, sw.2).
+    same_type_layers: dict[str, list[str]] = defaultdict(list)
+    for layer_name, layer_spec in kv_cache_spec.items():
+        same_type_layers[layer_spec.type_id].append(layer_name)
+
+    # Split each group into smaller groups, to make the number of layers in each
+    # group identical. Add padding to the last group of each type if necessary.
+    # E.g., (full.0, full.1), (sw.0, sw.1, sw.2)
+    # split to 3 groups with 2 layers each:
+    # (full.0, full.1), (sw.0, sw.1), (sw.2, padding).
+    # FIXME(Chen): At the moment of writing this code (2025-06-02), all
+    # open-source hybrid model follows a n:1 pattern between different attention
+    # types (e.g., Gemma3 5:1 between sw and full, LLaMA4 3:1 between local and
+    # full), so we can use the "1" in the n:1 pattern as the group size, which
+    # is the minimum number of layers among all attention types. Need a better
+    # strategy if we want to support more complex patterns (e.g., 20 full + 30
+    # sw, where the group size should be 10).
+    group_size = min([len(layers) for layers in same_type_layers.values()])
+    grouped_layers = []
+    for layers in same_type_layers.values():
+        num_padding_layers = group_size - len(layers) % group_size
+        if num_padding_layers != group_size:
+            logger.warning(
+                "Add %d padding layers, may waste at most %.2f%% KV cache memory",  # noqa
+                num_padding_layers,
+                num_padding_layers / len(layers) * 100,
+            )
+        for i in range(0, len(layers), group_size):
+            grouped_layers.append(layers[i:i + group_size])
+    kv_cache_groups = create_kv_cache_group_specs(kv_cache_spec,
+                                                  grouped_layers)
+
+    # Determine how model runners should initialize the KV cache tensors.
+    # We will have group_size memory pools, each is shared by one layer from
+    # each group. As layers of different groups have different block table,
+    # they will use different parts of the shared Tensor.
+    # The memory layout in the example will be:
+    # full.0, sw.0, sw.2: share a Tensor with size=available_memory//2
+    # full.1, sw.1: share another Tensor with size=available_memory//2
+    page_size = get_uniform_page_size(kv_cache_spec)
+    num_blocks = get_num_blocks(vllm_config, group_size, available_memory,
+                                page_size)
+    per_memory_pool_size = page_size * num_blocks
+    kv_cache_tensors = []
+    for i in range(group_size):
+        shared_by = []
+        for j in range(len(kv_cache_groups)):
+            if i < len(grouped_layers[j]):
+                shared_by.append(grouped_layers[j][i])
+        kv_cache_tensors.append(
+            KVCacheTensor(size=per_memory_pool_size, shared_by=shared_by))
 
     kv_cache_config = KVCacheConfig(
         num_blocks=num_blocks,
-        tensors={
-            layer_name: KVCacheTensor(size=per_layer_size)
-            for layer_name in kv_cache_spec
-        },
-        kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec,
-                                                    grouped_layer_names),
+        kv_cache_tensors=kv_cache_tensors,
+        kv_cache_groups=kv_cache_groups,
     )
+
+    # Print the KV cache size and maximum concurrency.
+    num_tokens = num_blocks // len(
+        grouped_layers) * vllm_config.cache_config.block_size
+    num_tokens_str = f"{num_tokens:,}"
+    logger.info("GPU KV cache size: %s tokens", num_tokens_str)
+    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
+    max_concurrency = get_max_concurrency_for_kv_cache_config(
+        vllm_config, kv_cache_config)
+    logger.info("Maximum concurrency for %s tokens per request: %.2fx",
+                max_model_len_str, max_concurrency)
     return kv_cache_config
 
 
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """
-    Only models with one type of KV cache are supported yet. This function tries
-    to convert the KV cache specs to one type if the model is a hybrid model
-    with multiple type of KV cache. It will convert all SlidingWindowSpec to
-    FullAttentionSpec if both types are present.
+    This function tries to convert the KV cache specs to one type if the model
+    is a hybrid model with multiple type of KV cache. It will convert all
+    SlidingWindowSpec to FullAttentionSpec if both types are present.
 
     Args:
         kv_cache_spec: The kv cache spec of each attention layer in the model
     """
 
+    def is_hybrid(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+        type_ids = set(layer_spec.type_id
+                       for layer_spec in kv_cache_spec.values())
+        return len(type_ids) > 1
+
+    if not is_hybrid(kv_cache_spec):
+        return
+
+    logger.warning(
+        "Hybrid KV cache manager is disabled for this hybrid model, "
+        "This means we do not enable any optimizations for saving KV cache "
+        "memory (e.g., dropping the KV cache outside the sliding window). "
+        "The compute of layers like sliding window is still saved.")
+
     has_full_attention = any(
         isinstance(spec, FullAttentionSpec) for spec in kv_cache_spec.values())
     has_sliding_window = any(
@@ -684,13 +916,18 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
                     sliding_window=spec.sliding_window,
                 )
 
+    if is_hybrid(kv_cache_spec):
+        raise ValueError("Hybrid KV cache manager is disabled but failed to "
+                         "convert the KV cache specs to one unified type.")
+
 
-def get_kv_cache_config(vllm_config: VllmConfig,
-                        kv_cache_spec: dict[str, KVCacheSpec],
-                        available_memory: int) -> KVCacheConfig:
+def get_kv_cache_config(
+    vllm_config: VllmConfig,
+    kv_cache_spec: dict[str, KVCacheSpec],
+    available_memory: int,
+) -> KVCacheConfig:
     """
-    Generates the KV cache configuration for a model
-    TODO: support hybrid models with more than one type of KV cache.
+    Generates the KV cache configuration for a model.
 
     Args:
         vllm_config: The global VllmConfig
@@ -701,13 +938,24 @@ def get_kv_cache_config(vllm_config: VllmConfig,
         The generated KVCacheConfigs
     """
     check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
-    unify_hybrid_kv_cache_specs(kv_cache_spec)
+
+    if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
+        unify_hybrid_kv_cache_specs(kv_cache_spec)
+
     if is_kv_cache_type_uniform(kv_cache_spec):
         # KV cache of all layers are the same, which is true for
         # most models. Allocate the same amount of memory for
         # each layer.
         return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec,
                                                  available_memory)
+    elif is_kv_cache_page_size_uniform(kv_cache_spec):
+        # Model contains multiple attention types, but KV cache of all layers
+        # have the same physical memory per block per layer. Split the layers
+        # into groups with the same number of layers, and thus same total page
+        # size.
+        return _get_kv_cache_config_uniform_page_size(vllm_config,
+                                                      kv_cache_spec,
+                                                      available_memory)
 
     raise NotImplementedError
 
diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py
index c17f80b6ae78a586177fa89a698692bd1ae18f10..dd5052a3480b79de11b9f82263ca2f045daa9b3f 100644
--- a/vllm/v1/core/sched/interface.py
+++ b/vllm/v1/core/sched/interface.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
 from typing import TYPE_CHECKING, Optional, Union
@@ -45,7 +46,7 @@ class SchedulerInterface(ABC):
         self,
         scheduler_output: "SchedulerOutput",
         model_runner_output: "ModelRunnerOutput",
-    ) -> "EngineCoreOutputs":
+    ) -> dict[int, "EngineCoreOutputs"]:
         """Update the scheduler state based on the model runner output.
 
         This method is called after the model runner has processed the scheduled
@@ -55,7 +56,8 @@ class SchedulerInterface(ABC):
         for each request.
 
         Returns:
-            A EngineCoreOutputs object containing the outputs for each request.
+            A dict of client index to EngineCoreOutputs object containing the
+            outputs for each request originating from that client.
         """
         raise NotImplementedError
 
@@ -126,6 +128,11 @@ class SchedulerInterface(ABC):
         """
         raise NotImplementedError
 
+    @abstractmethod
+    def get_request_counts(self) -> tuple[int, int]:
+        """Returns (num_running_reqs, num_waiting_reqs)."""
+        raise NotImplementedError
+
     @abstractmethod
     def make_stats(self) -> Optional["SchedulerStats"]:
         """Make a SchedulerStats object for logging.
diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
index 257234430983739968cbf91306f81159440e8014..9b0a439fe7dcdb59adcc4df4110a3f46afc24679 100644
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
@@ -26,7 +27,7 @@ class NewRequestData:
     mm_hashes: list[str]
     mm_positions: list[PlaceholderRange]
     sampling_params: SamplingParams
-    block_ids: list[list[int]]
+    block_ids: tuple[list[int], ...]
     num_computed_tokens: int
     lora_request: Optional[LoRARequest]
 
@@ -34,7 +35,7 @@ class NewRequestData:
     def from_request(
         cls,
         request: Request,
-        block_ids: list[list[int]],
+        block_ids: tuple[list[int], ...],
     ) -> NewRequestData:
         return cls(
             req_id=request.request_id,
@@ -85,7 +86,7 @@ class CachedRequestData:
     # request's block IDs instead of appending to the existing block IDs.
     resumed_from_preemption: bool
     new_token_ids: list[int]
-    new_block_ids: list[list[int]]
+    new_block_ids: tuple[list[int], ...]
     num_computed_tokens: int
 
     @classmethod
@@ -94,7 +95,7 @@ class CachedRequestData:
         request: Request,
         resumed_from_preemption: bool,
         new_token_ids: list[int],
-        new_block_ids: list[list[int]],
+        new_block_ids: tuple[list[int], ...],
     ) -> CachedRequestData:
         return cls(
             req_id=request.request_id,
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 4c6b3eea0cb759dc35334ba48e23791652d94ba6..3d7bbe7e0e396d8d25c1537bf4db9a3752c0a576 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
@@ -17,7 +18,7 @@ from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
                                                 compute_encoder_budget)
-from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager
+from vllm.v1.core.kv_cache_manager import KVCacheManager
 from vllm.v1.core.sched.interface import SchedulerInterface
 from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                        SchedulerOutput)
@@ -58,7 +59,8 @@ class Scheduler(SchedulerInterface):
         # request ids should be included in the EngineCoreOutputs returned
         # by update_from_outputs(). This is currently used in the multi-engine
         # case to track request lifetimes efficiently.
-        self.include_finished_set = include_finished_set
+        self.finished_req_ids_dict: Optional[dict[int, set[str]]] = (
+            defaultdict(set) if include_finished_set else None)
 
         # Scheduling constraints.
         self.max_num_running_reqs = self.scheduler_config.max_num_seqs
@@ -74,11 +76,16 @@ class Scheduler(SchedulerInterface):
         # KV Connector pushes/pull of remote KVs for P/D and offloading.
         self.connector = None
         if self.vllm_config.kv_transfer_config is not None:
+            assert len(self.kv_cache_config.kv_cache_groups) == 1, (
+                "Multiple KV cache groups are not currently supported "
+                "with KV connectors")
             self.connector = KVConnectorFactory.create_connector_v1(
                 config=self.vllm_config, role=KVConnectorRole.SCHEDULER)
 
         self.kv_event_publisher = EventPublisherFactory.create(
-            self.kv_events_config)
+            self.kv_events_config,
+            vllm_config.parallel_config.data_parallel_rank,
+        )
 
         num_gpu_blocks = self.cache_config.num_gpu_blocks
         assert num_gpu_blocks is not None and num_gpu_blocks > 0
@@ -97,7 +104,7 @@ class Scheduler(SchedulerInterface):
         # This is flushed at the end of each scheduling step.
         self.finished_req_ids: set[str] = set()
 
-        # P/D: requests in process of recving KV transfers
+        # KV Connector: requests in process of async KV loading or recving
         self.finished_recving_kv_req_ids: set[str] = set()
 
         # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
@@ -173,7 +180,7 @@ class Scheduler(SchedulerInterface):
         # uses structured decoding.
         structured_output_request_ids: dict[str, int] = {}
 
-        req_to_new_block_ids: dict[str, list[list[int]]] = {}
+        req_to_new_block_ids: dict[str, tuple[list[int], ...]] = {}
         num_scheduled_tokens: dict[str, int] = {}
         token_budget = self.max_num_scheduled_tokens
         # Encoder-related.
@@ -373,7 +380,8 @@ class Scheduler(SchedulerInterface):
                 # KVTransfer: WAITING reqs have num_computed_tokens > 0
                 # after async KV recvs are completed.
                 else:
-                    new_computed_blocks = KVCacheBlocks.create_empty()
+                    new_computed_blocks = (
+                        self.kv_cache_manager.create_empty_block_list())
                     num_new_local_computed_tokens = 0
                     num_computed_tokens = request.num_computed_tokens
 
@@ -420,11 +428,11 @@ class Scheduler(SchedulerInterface):
                     # The request cannot be scheduled.
                     break
 
-                # KVConnector: update internal state after allocation.
+                # KVTransfer: the connector uses this info to determine
+                # if a load is needed. Note that
                 # This information is used to determine if a load is
                 # needed for this request.
-                if num_external_computed_tokens:
-                    assert self.connector is not None
+                if self.connector is not None:
                     self.connector.update_state_after_alloc(
                         request,
                         new_computed_blocks + new_blocks,
@@ -463,7 +471,7 @@ class Scheduler(SchedulerInterface):
                 token_budget -= num_new_tokens
                 request.status = RequestStatus.RUNNING
                 request.num_computed_tokens = num_computed_tokens
-                # Count the number of prifix cached tokens.
+                # Count the number of prefix cached tokens.
                 if request.num_cached_tokens < 0:
                     request.num_cached_tokens = num_computed_tokens
                 # Encoder-related.
@@ -580,7 +588,7 @@ class Scheduler(SchedulerInterface):
         request: Request,
         num_scheduled_tokens: int,
         num_scheduled_spec_tokens: int,
-        new_block_ids: list[list[int]],
+        new_block_ids: tuple[list[int], ...],
         resumed_from_preemption: bool,
     ) -> CachedRequestData:
         # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
@@ -693,7 +701,7 @@ class Scheduler(SchedulerInterface):
         self,
         scheduler_output: SchedulerOutput,
         model_runner_output: ModelRunnerOutput,
-    ) -> EngineCoreOutputs:
+    ) -> dict[int, EngineCoreOutputs]:
         sampled_token_ids = model_runner_output.sampled_token_ids
         spec_token_ids = model_runner_output.spec_token_ids
         logprobs = model_runner_output.logprobs
@@ -701,7 +709,7 @@ class Scheduler(SchedulerInterface):
         num_scheduled_tokens = scheduler_output.num_scheduled_tokens
 
         new_running: list[Request] = []
-        outputs: list[EngineCoreOutput] = []
+        outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
         spec_decoding_stats: Optional[SpecDecodingStats] = None
 
         # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below
@@ -797,7 +805,7 @@ class Scheduler(SchedulerInterface):
             if new_token_ids or kv_transfer_params:
 
                 # Add EngineCoreOutput for this Request.
-                outputs.append(
+                outputs[request.client_index].append(
                     EngineCoreOutput(
                         request_id=req_id,
                         new_token_ids=new_token_ids,
@@ -817,7 +825,7 @@ class Scheduler(SchedulerInterface):
             if not stopped:
                 new_running.append(request)
 
-        # P/D: update state for finished KV Transfers.
+        # KV Connector: update state for finished KV Transfers.
         self._update_from_kv_xfer_finished(model_runner_output)
 
         # Return the cached request data to the queue so they can be reused.
@@ -828,17 +836,38 @@ class Scheduler(SchedulerInterface):
                 self._cached_reqs_data[req_data.req_id].append(req_data)
 
         self.running = new_running
-        engine_core_outputs = EngineCoreOutputs(
-            outputs=outputs,
-            scheduler_stats=self.make_stats(spec_decoding_stats),
-        )
-        if self.include_finished_set:
-            #TODO currently sending duplicates here, improve this
-            engine_core_outputs.finished_requests = (
-                scheduler_output.finished_req_ids | self.finished_req_ids)
+
+        # Create EngineCoreOutputs for all clients that have requests with
+        # outputs in this step.
+        engine_core_outputs = {
+            client_index: EngineCoreOutputs(outputs=outs)
+            for client_index, outs in outputs.items()
+        }
+
+        finished_req_ids = self.finished_req_ids_dict
+        if finished_req_ids:
+            # Include ids of requests that finished since last outputs
+            # were sent.
+            for client_index, finished_set in finished_req_ids.items():
+                # Set finished request set in EngineCoreOutputs for this client.
+                if (eco := engine_core_outputs.get(client_index)) is not None:
+                    eco.finished_requests = finished_set
+                else:
+                    engine_core_outputs[client_index] = EngineCoreOutputs(
+                        finished_requests=finished_set)
+            finished_req_ids.clear()
+
+        if engine_core_outputs:
+            # Return stats to only one of the front-ends.
+            next(iter(engine_core_outputs.values())).scheduler_stats = (
+                self.make_stats(spec_decoding_stats))
 
         return engine_core_outputs
 
+    def get_request_counts(self) -> tuple[int, int]:
+        """Returns (num_running_reqs, num_waiting_reqs)."""
+        return len(self.running), len(self.waiting)
+
     def add_request(self, request: Request) -> None:
         self.waiting.append(request)
         self.requests[request.request_id] = request
@@ -880,8 +909,11 @@ class Scheduler(SchedulerInterface):
 
         delay_free_blocks, kv_xfer_params = self._connector_finished(request)
         self.encoder_cache_manager.free(request)
-        self._cached_reqs_data.pop(request.request_id, None)
-        self.finished_req_ids.add(request.request_id)
+        request_id = request.request_id
+        self._cached_reqs_data.pop(request_id, None)
+        self.finished_req_ids.add(request_id)
+        if self.finished_req_ids_dict is not None:
+            self.finished_req_ids_dict[request.client_index].add(request_id)
 
         if not delay_free_blocks:
             self._free_blocks(request)
@@ -940,7 +972,7 @@ class Scheduler(SchedulerInterface):
             self.kv_event_publisher.shutdown()
 
     ########################################################################
-    # P/D Related Methods
+    # KV Connector Related Methods
     ########################################################################
 
     def get_kv_connector(self) -> Optional[KVConnectorBase_V1]:
@@ -956,14 +988,13 @@ class Scheduler(SchedulerInterface):
         """
         if self.connector is None:
             return False, None
-        assert len(self.kv_cache_config.kv_cache_groups
-                   ) == 1, "KV connector only supports one KV cache group now"
-        block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0]
+
+        (block_ids, ) = self.kv_cache_manager.get_block_ids(request.request_id)
         return self.connector.request_finished(request, block_ids)
 
     def _update_waiting_for_remote_kv(self, request: Request) -> bool:
         """
-        P/D: check if the request_id is finished_recving.
+        KV Connector: check if the request_id is finished_recving.
 
         The finished_recving_kv_req_ids list is populated
         on the previous steps()'s update_from_output based
@@ -973,20 +1004,18 @@ class Scheduler(SchedulerInterface):
         and the request state will be moved back to WAITING from
         WAITING_FOR_REMOTE_KV.
         """
+        assert self.connector is not None
         if request.request_id not in self.finished_recving_kv_req_ids:
             return False
-        assert len(self.kv_cache_config.kv_cache_groups
-                   ) == 1, "KV connector only supports one KV cache group now"
+
         # Now that the blocks are ready, actually cache them.
-        block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0]
+        (block_ids, ) = self.kv_cache_manager.get_block_ids(request.request_id)
         num_computed_tokens = len(block_ids) * self.block_size
+        # Handle the case where num request tokens less then one block.
+        num_computed_tokens = min(num_computed_tokens, request.num_tokens)
         if num_computed_tokens == request.num_tokens:
             num_computed_tokens -= 1
-        self.kv_cache_manager.single_type_manager.cache_blocks(
-            request,
-            self.kv_cache_manager.req_to_block_hashes[request.request_id],
-            num_computed_tokens,
-        )
+        self.kv_cache_manager.cache_blocks(request, num_computed_tokens)
 
         # Update the request state for scheduling.
         request.num_computed_tokens = num_computed_tokens
@@ -998,7 +1027,7 @@ class Scheduler(SchedulerInterface):
     def _update_from_kv_xfer_finished(self,
                                       model_runner_output: ModelRunnerOutput):
         """
-        P/D: update the scheduler state based on the output.
+        KV Connector: update the scheduler state based on the output.
 
         The Worker side connectors add finished_recving and
         finished_sending reqs to the output.
@@ -1006,7 +1035,7 @@ class Scheduler(SchedulerInterface):
         # if finished_recving: add to state so we can
             scheduler the request during the next step.
         """
-        # P/D: update recv and send status from last step.
+        # KV Connector:: update recv and send status from last step.
         for req_id in (model_runner_output.finished_recving or ()):
             logger.debug("Finished recving KV transfer for request %s", req_id)
             self.finished_recving_kv_req_ids.add(req_id)
diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py
index 3a0028a59016ebaf0149577423d07c3038165f0c..1397c5f4c9a6d6fae566f3c69e525b7921fed04f 100644
--- a/vllm/v1/core/sched/utils.py
+++ b/vllm/v1/core/sched/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.v1.request import Request, RequestStatus
 
 
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index 0223c9ceec8debee761c904f11157a50cce64dfe..95222779c3afb161b23a5df3b3b5d8e3424ef931 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -1,11 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from typing import Callable
 
 from vllm.utils import cdiv
 from vllm.v1.core.block_pool import BlockPool
-from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock
+from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheSpec,
                                         SlidingWindowSpec)
 from vllm.v1.request import Request
@@ -21,8 +22,7 @@ class SingleTypeKVCacheManager(ABC):
         self,
         kv_cache_spec: KVCacheSpec,
         block_pool: BlockPool,
-        use_eagle: bool,
-        num_kv_cache_groups: int,
+        kv_cache_group_id: int,
         caching_hash_fn: Callable,
     ) -> None:
         """
@@ -30,9 +30,7 @@ class SingleTypeKVCacheManager(ABC):
         Args:
             kv_cache_spec: The kv_cache_spec for this manager.
             block_pool: The block pool.
-            use_eagle: Whether to use eagle.
-            num_kv_cache_groups: The number of kv cache groups managed by this 
-                manager.
+            kv_cache_group_id: The id of the kv cache group of this manager.
             caching_hash_fn: The caching hash function.
         """
 
@@ -40,9 +38,6 @@ class SingleTypeKVCacheManager(ABC):
         self.kv_cache_spec = kv_cache_spec
         self.block_pool = block_pool
 
-        # Needs special handling for find_longest_cache_hit if eagle is enabled
-        self.use_eagle = use_eagle
-
         # Mapping from request ID to blocks to track the blocks allocated
         # for each request, so that we can free the blocks when the request
         # is finished.
@@ -55,8 +50,8 @@ class SingleTypeKVCacheManager(ABC):
         # data for reempted ones.
         self.num_cached_block: dict[str, int] = {}
 
-        self.num_kv_cache_groups = num_kv_cache_groups
         self.caching_hash_fn = caching_hash_fn
+        self.kv_cache_group_id = kv_cache_group_id
 
     def get_num_blocks_to_allocate(
             self, request_id: str, num_tokens: int,
@@ -82,10 +77,10 @@ class SingleTypeKVCacheManager(ABC):
         # free queue and ref_cnt == 0), it will be changed from a free block
         # to a computed block when the request is allocated, so we also count
         # it as needed to be allocated.
-        num_evictable_computed_blocks = sum(blk.ref_cnt == 0
-                                            for blk in new_computed_blocks)
-        return ((num_new_blocks + num_evictable_computed_blocks) *
-                self.num_kv_cache_groups)
+        num_evictable_computed_blocks = sum(
+            blk.ref_cnt == 0 and not blk.is_null
+            for blk in new_computed_blocks)
+        return num_new_blocks + num_evictable_computed_blocks
 
     def save_new_computed_blocks(
             self, request_id: str,
@@ -128,12 +123,11 @@ class SingleTypeKVCacheManager(ABC):
         if num_new_blocks <= 0:
             return []
         else:
-            new_blocks = self.block_pool.get_new_blocks(
-                num_new_blocks * self.num_kv_cache_groups)
+            new_blocks = self.block_pool.get_new_blocks(num_new_blocks)
             req_blocks.extend(new_blocks)
             return new_blocks
 
-    def cache_blocks(self, request: Request, block_hashes: list[BlockHashType],
+    def cache_blocks(self, request: Request, block_hashes: list[BlockHash],
                      num_tokens: int) -> None:
         """
         Cache the blocks for the request.
@@ -154,12 +148,19 @@ class SingleTypeKVCacheManager(ABC):
             num_cached_blocks=num_cached_blocks,
             num_full_blocks=num_full_blocks,
             block_size=self.block_size,
+            kv_cache_group_id=self.kv_cache_group_id,
             hash_fn=self.caching_hash_fn,
         )
 
         self.num_cached_block[request.request_id] = num_full_blocks
 
     def free(self, request_id: str) -> None:
+        """
+        Free the blocks for the request.
+
+        Args:
+            request_id: The request ID.
+        """
         # Default to [] in case a request is freed (aborted) before alloc.
         req_blocks = self.req_to_blocks.pop(request_id, [])
 
@@ -186,12 +187,22 @@ class SingleTypeKVCacheManager(ABC):
 
         raise NotImplementedError
 
+    @classmethod
     @abstractmethod
-    def find_longest_cache_hit(self, block_hashes: list[BlockHashType],
-                               max_length: int) -> list[KVCacheBlock]:
+    def find_longest_cache_hit(
+        cls,
+        block_hashes: list[BlockHash],
+        max_length: int,
+        kv_cache_group_ids: list[int],
+        block_pool: BlockPool,
+        kv_cache_spec: KVCacheSpec,
+        use_eagle: bool,
+    ) -> tuple[list[KVCacheBlock], ...]:
         """
         Get the longest cache hit prefix of the blocks that is not longer than 
-        `max_length`. If no cache hit is found, return an empty list. 
+        `max_length`. The prefix should be a common prefix hit for all the 
+        kv cache groups in `kv_cache_group_ids`. If no cache hit is found, 
+        return an empty list. 
         If eagle is enabled, drop the last matched block to force recompute the 
         last block to get the required hidden states for eagle drafting head. 
         Need to be customized for each attention type.
@@ -199,12 +210,20 @@ class SingleTypeKVCacheManager(ABC):
         Args:
             block_hashes: The block hashes of the request.
             max_length: The maximum length of the cache hit prefix.
+            kv_cache_group_ids: The ids of the kv cache groups.
+            block_pool: The block pool.
+            kv_cache_spec: The kv cache spec.
+            use_eagle: Whether to use eagle.
 
         Returns:
-            A list of cached blocks with skipped blocks replaced by null block.
+            A list of cached blocks with skipped blocks replaced by null block
+            for each kv cache group in `kv_cache_group_ids`.
+            Return a list of length `len(kv_cache_group_ids)`, where the i-th
+            element is a list of cached blocks for the i-th kv cache group
+            in `kv_cache_group_ids`.
             For example, sliding window manager should return a list like
-            [NULL, NULL, KVCacheBlock(7), KVCacheBlock(8)] for block size 4 and 
-            sliding window 8. 
+            ([NULL, NULL, KVCacheBlock(7), KVCacheBlock(8)]) for block size 4
+            and sliding window 8 and len(kv_cache_group_ids) = 1.
         """
 
         raise NotImplementedError
@@ -213,11 +232,9 @@ class SingleTypeKVCacheManager(ABC):
     def remove_skipped_blocks(self, request_id: str,
                               num_computed_tokens: int) -> None:
         """
-        Remove the blocks that are no longer needed from `blocks`. The removed 
-        blocks should be replaced by null_block. Return the removed blocks in 
-        eviction order, where the first returned block should be evicted first.
-        Don't free the removed blocks in this function. Need to be customized 
-        for each attention type.
+        Remove the blocks that are no longer needed from `blocks` and free the 
+        blocks. The removed blocks should be replaced by null_block.
+        Need to be customized for each attention type.
 
         Args:
             request_id: The request ID.
@@ -228,21 +245,34 @@ class SingleTypeKVCacheManager(ABC):
 
 class FullAttentionManager(SingleTypeKVCacheManager):
 
-    def find_longest_cache_hit(self, block_hashes: list[BlockHashType],
-                               max_length: int) -> list[KVCacheBlock]:
-        computed_blocks: list[KVCacheBlock] = []
-        max_num_blocks = max_length // self.block_size
-        for i in range(max_num_blocks):
-            block_hash = block_hashes[i]
+    @classmethod
+    def find_longest_cache_hit(
+        cls,
+        block_hashes: list[BlockHash],
+        max_length: int,
+        kv_cache_group_ids: list[int],
+        block_pool: BlockPool,
+        kv_cache_spec: KVCacheSpec,
+        use_eagle: bool,
+    ) -> tuple[list[KVCacheBlock], ...]:
+        assert isinstance(kv_cache_spec, FullAttentionSpec), (
+            "FullAttentionManager can only be used for full attention groups")
+        computed_blocks: tuple[list[KVCacheBlock], ...] = tuple(
+            [] for _ in range(len(kv_cache_group_ids)))
+        max_num_blocks = max_length // kv_cache_spec.block_size
+        for i, block_hash in zip(range(max_num_blocks), block_hashes):
             # block_hashes is a chain of block hashes. If a block hash is not
             # in the cached_block_hash_to_id, the following block hashes are
             # not computed yet for sure.
-            if cached_block := self.block_pool.get_cached_block(block_hash):
-                computed_blocks.append(cached_block)
+            if cached_block := block_pool.get_cached_block(
+                    block_hash, kv_cache_group_ids):
+                for computed, cached in zip(computed_blocks, cached_block):
+                    computed.append(cached)
             else:
                 break
-        if self.use_eagle and len(computed_blocks) > 0:
-            computed_blocks.pop()
+        if use_eagle and computed_blocks[0]:
+            for computed in computed_blocks:
+                computed.pop()
         return computed_blocks
 
     def remove_skipped_blocks(self, request_id: str,
@@ -265,45 +295,58 @@ class FullAttentionManager(SingleTypeKVCacheManager):
 class SlidingWindowManager(SingleTypeKVCacheManager):
 
     def __init__(self, kv_cache_spec: SlidingWindowSpec, block_pool: BlockPool,
-                 use_eagle: bool, **kwargs) -> None:
-        super().__init__(kv_cache_spec, block_pool, use_eagle, **kwargs)
+                 **kwargs) -> None:
+        super().__init__(kv_cache_spec, block_pool, **kwargs)
         self.sliding_window = kv_cache_spec.sliding_window
+        self._null_block = block_pool.null_block
+
+    @classmethod
+    def find_longest_cache_hit(
+        cls,
+        block_hashes: list[BlockHash],
+        max_length: int,
+        kv_cache_group_ids: list[int],
+        block_pool: BlockPool,
+        kv_cache_spec: KVCacheSpec,
+        use_eagle: bool,
+    ) -> tuple[list[KVCacheBlock], ...]:
+        assert isinstance(kv_cache_spec, SlidingWindowSpec), (
+            "SlidingWindowManager can only be used for sliding window groups")
+
         # The number of contiguous blocks needed for prefix cache hit.
         # -1 since the input token itself is also included in the window
-        self.sliding_window_contiguous_blocks = cdiv(
-            (kv_cache_spec.sliding_window - 1), self.block_size)
-        if self.use_eagle:
+        sliding_window_contiguous_blocks = cdiv(
+            kv_cache_spec.sliding_window - 1, kv_cache_spec.block_size)
+        if use_eagle:
             # Need to drop the last matched block if eagle is enabled. For
             # sliding window layer, we achieve this by increasing the number of
             # contiguous blocks needed for prefix cache hit by one and dropping
             # the last matched block.
-            self.sliding_window_contiguous_blocks += 1
-        self._null_block = block_pool.null_block
+            sliding_window_contiguous_blocks += 1
 
-    def find_longest_cache_hit(self, block_hashes: list[BlockHashType],
-                               max_length: int) -> list[KVCacheBlock]:
         # TODO: reduce i by sliding_window_contiguous_blocks when cache miss, to
         # optimize the time complexity from O(max_num_blocks) to
         # O(max_num_blocks / sliding_window_contiguous_blocks +
         # sliding_window_contiguous_blocks),
         # which is good for low cache hit rate scenarios.
-        max_num_blocks = max_length // self.block_size
-        computed_blocks = [self._null_block] * max_num_blocks
+        max_num_blocks = max_length // kv_cache_spec.block_size
+        computed_blocks = tuple([block_pool.null_block] * max_num_blocks
+                                for _ in range(len(kv_cache_group_ids)))
         num_contiguous_blocks = 0
-
         match_found = False
         # Search from right to left and early stop when a match is found.
         for i in range(max_num_blocks - 1, -1, -1):
-            if cached_block := self.block_pool.get_cached_block(
-                    block_hashes[i]):
-                computed_blocks[i] = cached_block
+            if cached_block := block_pool.get_cached_block(
+                    block_hashes[i], kv_cache_group_ids):
+                for computed, cached in zip(computed_blocks, cached_block):
+                    computed[i] = cached
                 num_contiguous_blocks += 1
-                if (num_contiguous_blocks
-                        >= self.sliding_window_contiguous_blocks):
+                if num_contiguous_blocks >= sliding_window_contiguous_blocks:
                     # Trim the trailing blocks.
                     # E.g., [NULL, NULL, 8, 3, NULL, 9] -> [NULL, NULL, 8, 3]
                     # when sliding_window_contiguous_blocks=2.
-                    del computed_blocks[i + num_contiguous_blocks:]
+                    for computed in computed_blocks:
+                        del computed[i + num_contiguous_blocks:]
                     match_found = True
                     break
             else:
@@ -311,9 +354,11 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
         if not match_found:
             # The first `num_contiguous_blocks` is a cache hit even if
             # `num_contiguous_blocks < sliding_window_contiguous_blocks`.
-            del computed_blocks[num_contiguous_blocks:]
-        if self.use_eagle and len(computed_blocks) > 0:
-            computed_blocks.pop()
+            for computed in computed_blocks:
+                del computed[num_contiguous_blocks:]
+        if use_eagle and computed_blocks[0]:
+            for computed in computed_blocks:
+                computed.pop()
         return computed_blocks
 
     def remove_skipped_blocks(self, request_id: str,
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 41db99beaad5e1a87ec071e50ae3162a36289593..59463f1ba99f55dea1faac775ecf3ae16a12fc00 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 import time
@@ -44,10 +45,6 @@ class EngineCoreRequest(
         omit_defaults=True,  # type: ignore[call-arg]
         gc=False):  # type: ignore[call-arg]
 
-    # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
-    # but this object is currently not playing well with msgspec
-    # due to circular imports and typing we have in data.py
-
     request_id: str
     prompt_token_ids: list[int]
     mm_inputs: Optional[Sequence[Optional[MultiModalKwargs]]]
@@ -58,6 +55,11 @@ class EngineCoreRequest(
     arrival_time: float
     lora_request: Optional[LoRARequest]
     cache_salt: Optional[str]
+    data_parallel_rank: Optional[int]
+
+    # Index of the client, used to ensure outputs are sent back to the same
+    # client for this request when scaling out the front-end.
+    client_index: int = 0
 
     # Used in DP case to indicate which wave of requests this is expected to
     # belong to, to cover a race condition where the request is sent before
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 74c2251c752146ba37ff9c4a966bc1a3e3c3e4a1..7fb36cf5941e8270be8e5bc6ddf61441373a6e7a 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 from collections.abc import AsyncGenerator, Mapping
 from copy import copy
@@ -27,7 +28,7 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Device, cdiv
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.core_client import AsyncMPClient, DPAsyncMPClient
+from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
 from vllm.v1.engine.output_processor import (OutputProcessor,
                                              RequestOutputCollector)
@@ -36,6 +37,7 @@ from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.metrics.loggers import (StatLoggerBase, StatLoggerFactory,
                                      setup_default_loggers)
+from vllm.v1.metrics.prometheus import shutdown_prometheus
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
@@ -54,6 +56,8 @@ class AsyncLLM(EngineClient):
         log_requests: bool = True,
         start_engine_loop: bool = True,
         stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        client_addresses: Optional[dict[str, str]] = None,
+        client_index: int = 0,
     ) -> None:
         """
         Create an AsyncLLM.
@@ -116,14 +120,13 @@ class AsyncLLM(EngineClient):
                                                 log_stats=self.log_stats)
 
         # EngineCore (starts the engine in background process).
-        core_client_class = AsyncMPClient if (
-            vllm_config.parallel_config.data_parallel_size
-            == 1) else DPAsyncMPClient
 
-        self.engine_core = core_client_class(
+        self.engine_core = EngineCoreClient.make_async_mp_client(
             vllm_config=vllm_config,
             executor_class=executor_class,
             log_stats=self.log_stats,
+            client_addresses=client_addresses,
+            client_index=client_index,
         )
         if self.stat_loggers:
             for stat_logger in self.stat_loggers[0]:
@@ -145,6 +148,8 @@ class AsyncLLM(EngineClient):
         stat_loggers: Optional[list[StatLoggerFactory]] = None,
         disable_log_requests: bool = False,
         disable_log_stats: bool = False,
+        client_addresses: Optional[dict[str, str]] = None,
+        client_index: int = 0,
     ) -> "AsyncLLM":
         if not envs.VLLM_USE_V1:
             raise ValueError(
@@ -162,6 +167,8 @@ class AsyncLLM(EngineClient):
             log_requests=not disable_log_requests,
             log_stats=not disable_log_stats,
             usage_context=usage_context,
+            client_addresses=client_addresses,
+            client_index=client_index,
         )
 
     @classmethod
@@ -195,6 +202,8 @@ class AsyncLLM(EngineClient):
     def shutdown(self):
         """Shutdown, cleaning up the background proc and IPC."""
 
+        shutdown_prometheus()
+
         if engine_core := getattr(self, "engine_core", None):
             engine_core.shutdown()
 
@@ -212,6 +221,7 @@ class AsyncLLM(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
     ) -> RequestOutputCollector:
         """Add new request to the AsyncLLM."""
 
@@ -228,7 +238,7 @@ class AsyncLLM(EngineClient):
         prompt_str, request = self.processor.process_inputs(
             request_id, prompt, params, arrival_time, lora_request,
             tokenization_kwargs, trace_headers, prompt_adapter_request,
-            priority)
+            priority, data_parallel_rank)
 
         if params.n == 1:
             await self._add_request(request, prompt_str, None, 0, queue)
@@ -274,6 +284,7 @@ class AsyncLLM(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         """
         Main function called by the API server to kick off a request
@@ -304,6 +315,7 @@ class AsyncLLM(EngineClient):
                 trace_headers=trace_headers,
                 prompt_adapter_request=prompt_adapter_request,
                 priority=priority,
+                data_parallel_rank=data_parallel_rank,
             )
 
             # The output_handler task pushes items into the queue.
@@ -320,8 +332,9 @@ class AsyncLLM(EngineClient):
                 yield out
 
         # If the request is disconnected by the client, generate()
-        # is cancelled. So, we abort the request if we end up here.
-        except asyncio.CancelledError:
+        # is cancelled or the generator is garbage collected. So,
+        # we abort the request if we end up here.
+        except (asyncio.CancelledError, GeneratorExit):
             await self.abort(request_id)
             if self.log_requests:
                 logger.info("Request %s aborted.", request_id)
@@ -398,7 +411,6 @@ class AsyncLLM(EngineClient):
                     # TODO(rob): make into a coroutine and launch it in
                     # background thread once Prometheus overhead is non-trivial.
                     if stat_loggers:
-                        assert outputs.scheduler_stats is not None
                         AsyncLLM._record_stats(
                             stat_loggers[outputs.engine_index],
                             scheduler_stats=outputs.scheduler_stats,
@@ -422,7 +434,7 @@ class AsyncLLM(EngineClient):
     @staticmethod
     def _record_stats(
         stat_loggers: list[StatLoggerBase],
-        scheduler_stats: SchedulerStats,
+        scheduler_stats: Optional[SchedulerStats],
         iteration_stats: Optional[IterationStats],
     ):
         """static so that it can be used from the output_handler task
diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f6ba099c650c036a3051d97cc5afc776dc847e4
--- /dev/null
+++ b/vllm/v1/engine/coordinator.py
@@ -0,0 +1,253 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import multiprocessing
+import time
+import weakref
+from typing import Optional
+
+import msgspec.msgpack
+import zmq
+
+from vllm.config import ParallelConfig
+from vllm.logger import init_logger
+from vllm.utils import get_mp_context, get_open_zmq_ipc_path, make_zmq_socket
+from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequestType
+from vllm.v1.serial_utils import MsgpackDecoder
+from vllm.v1.utils import get_engine_client_zmq_addr, shutdown
+
+logger = init_logger(__name__)
+
+
+class DPCoordinator:
+    """Coordinator process used for data-parallel deployments (DP>1).
+
+    Intermediates between multiple DP engine rank processes and one or more
+    front-end API server processes.
+
+    * Collects stats from each DP engine (currently just waiting and running
+      queue lengths), and publishes these to all front-ends for use in
+      load-balancing decisions.
+
+    * Keeps track of the current DP "request wave" number and running state
+      of the engines. This is received from the DP rank 0 engine and published
+      to the front-end processes along with the current load stats.
+
+      The engines alternate between a global running/paused state. The global
+      "request wave" number is a count of the number of times that the workers
+      collectively move from a running state to a paused state. This transition
+      is synchronized via the all-reduce operation performed in the
+      DPEngineCoreProc._has_global_unfinished_reqs method.
+
+    * Broadcasts the START_DP_WAVE message to engines to move them from paused
+      to running state when one engine receives a new request. This can happen
+      in two cases:
+      1) A front-end sending a new request while the engines are paused will
+         concurrently notify the coordinator.
+      2) An engine receiving a request for a stale request wave while in paused
+         state will notify the coordinator.
+
+    Engines will move into running state when receiving a new request or
+    START_DP_WAVE message.
+    """
+
+    def __init__(self, parallel_config: ParallelConfig):
+
+        # Assume coordinator is colocated with front-end procs.
+        front_publish_address = get_open_zmq_ipc_path()
+
+        dp_size = parallel_config.data_parallel_size
+        assert dp_size > 1, "Coordinator only used for data parallel"
+
+        local_only = dp_size == parallel_config.data_parallel_size_local
+        host = parallel_config.data_parallel_master_ip
+        back_publish_address = get_engine_client_zmq_addr(local_only, host)
+        back_output_address = get_engine_client_zmq_addr(local_only, host)
+
+        context = get_mp_context()
+        self.proc: multiprocessing.Process = context.Process(
+            target=CoordinatorProc.run_coordinator,
+            name="VLLM_DP_Coordinator",
+            kwargs={
+                "engine_count": parallel_config.data_parallel_size,
+                "front_publish_address": front_publish_address,
+                "back_output_address": back_output_address,
+                "back_publish_address": back_publish_address,
+            },
+            daemon=True)
+        self.proc.start()
+
+        self.stats_publish_address = front_publish_address
+        self.coord_in_address = back_publish_address
+        self.coord_out_address = back_output_address
+        self._finalizer = weakref.finalize(self, shutdown, [self.proc])
+
+    def get_stats_publish_address(self) -> str:
+        return self.stats_publish_address
+
+    def get_engine_socket_addresses(self) -> tuple[str, str]:
+        """Returns tuple of ZMQ input address, output address."""
+        return self.coord_in_address, self.coord_out_address
+
+    def close(self):
+        self._finalizer()
+
+
+class EngineState:
+
+    def __init__(self):
+        self.request_counts = [0, 0]  # [waiting, running]
+
+
+class CoordinatorProc:
+
+    def __init__(self, engine_count: int):
+
+        self.ctx = zmq.Context()
+
+        self.engines = [EngineState() for _ in range(engine_count)]
+
+        self.current_wave = 0
+        self.engines_running = False
+        self.stats_changed = False
+
+    @staticmethod
+    def run_coordinator(
+        engine_count: int,
+        front_publish_address: str,
+        back_output_address: str,
+        back_publish_address: str,
+    ):
+        coordinator = CoordinatorProc(engine_count=engine_count)
+        try:
+            coordinator.process_input_socket(
+                front_publish_address,
+                back_output_address,
+                back_publish_address,
+            )
+        except KeyboardInterrupt:
+            logger.info("DP Coordinator process exiting")
+
+    def process_input_socket(self, front_publish_address: str,
+                             back_output_address: str,
+                             back_publish_address: str):
+
+        decoder = MsgpackDecoder(EngineCoreOutputs)
+
+        with make_zmq_socket(
+                path=front_publish_address,  # IPC
+                ctx=self.ctx,
+                socket_type=zmq.XPUB,
+                bind=True,
+        ) as publish_front, make_zmq_socket(
+                path=back_output_address,  # IPC or TCP
+                ctx=self.ctx,
+                socket_type=zmq.PULL,
+                bind=True,
+        ) as output_back, make_zmq_socket(
+                path=back_publish_address,  # IPC or TCP
+                ctx=self.ctx,
+                socket_type=zmq.XPUB,
+                bind=True,
+        ) as publish_back:
+
+            poller = zmq.Poller()
+            poller.register(publish_front, zmq.POLLIN)
+            poller.register(output_back, zmq.POLLIN)
+            last_publish_time = 0
+            while True:
+                elapsed = int(time.time() * 1000) - last_publish_time
+                # Send at 100 ms interval if the stats have changed,
+                # or otherwise every 3 seconds.
+                wait_for = 100 if self.stats_changed else 3000
+                events = poller.poll(timeout=max(0, wait_for - elapsed))
+                if not events:
+                    # Poller timeout - publish current stats to front-ends.
+                    engine_req_counts_list = self._get_engine_counts()
+                    to_publish = (engine_req_counts_list, self.current_wave,
+                                  self.engines_running)
+                    publish_front.send(msgspec.msgpack.encode(to_publish))
+                    last_publish_time = int(time.time() * 1000)
+                    self.stats_changed = False
+                    continue
+
+                events = dict(events)
+
+                if publish_front in events:
+                    buffer = publish_front.recv()
+                    if buffer == b'\x01':
+                        # Ignore subscription messages.
+                        continue
+
+                    # We received a message on the front-end XPUB socket,
+                    # from an API server sending a new request while the
+                    # engines are paused, so that we can wake the other
+                    # engines.
+                    engine_to_exclude, wave = msgspec.msgpack.decode(buffer)
+                    if wave < self.current_wave:
+                        # If the wave number is stale, ensure the message is
+                        # handled by all the engines.
+                        engine_to_exclude = None
+                    if not self.engines_running:
+                        self.engines_running = True
+                        self.stats_changed = True
+                        self._send_start_wave(publish_back, self.current_wave,
+                                              engine_to_exclude)
+
+                if output_back in events:
+                    # We received a message from one of the engines.
+
+                    buffer = output_back.recv()
+                    outputs: EngineCoreOutputs = decoder.decode(buffer)
+
+                    assert not outputs.outputs
+                    assert outputs.utility_output is None
+
+                    eng_index = outputs.engine_index
+                    if outputs.scheduler_stats:
+                        # 1. Updated request load stats - update our local
+                        # state with these.
+                        stats = self.engines[eng_index].request_counts
+                        stats[0] = outputs.scheduler_stats.num_waiting_reqs
+                        stats[1] = outputs.scheduler_stats.num_running_reqs
+                        self.stats_changed = True
+
+                    if (wave := outputs.wave_complete) is not None:
+                        # 2. Notification from rank 0 engine that we've
+                        # moved into the global paused state
+                        # (engines_running==False)
+                        if self.current_wave <= wave:
+                            logger.debug("Moving DP wave from %d to %d.",
+                                         self.current_wave, wave)
+                            self.current_wave = wave + 1
+                            self.engines_running = False
+                            self.stats_changed = True
+                    elif (wave := outputs.start_wave) is not None and (
+                            wave > self.current_wave or
+                        (wave == self.current_wave
+                         and not self.engines_running)):
+                        # 3. The engine received request for a non-current wave
+                        # so we must ensure that other engines progress to the
+                        # next wave (race condition handling).
+                        logger.debug(
+                            "Starting wave %d after notification of "
+                            "stale wave request from engine.", wave)
+                        self.current_wave = wave
+                        self.engines_running = True
+                        self.stats_changed = True
+                        self._send_start_wave(publish_back, wave, eng_index)
+
+    @staticmethod
+    def _send_start_wave(socket: zmq.Socket, wave: int,
+                         exclude_engine_index: Optional[int]):
+        """Broadcast the START_DP_WAVE message to all the engines.
+        It includes the current wave number and index of engine which
+        has already received a request with this wave number and so doesn't
+        require additional notification.
+        """
+        wave_encoded = msgspec.msgpack.encode((wave, exclude_engine_index))
+        socket.send_multipart(
+            (EngineCoreRequestType.START_DP_WAVE.value, wave_encoded))
+
+    def _get_engine_counts(self) -> list[list[int]]:
+        """Return list of [waiting, running] count lists for each engine."""
+        return [e.request_counts for e in self.engines]
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 9cd4d106cfed6cd9f5d8369f5959dcadb1caf2ba..0202dc4f8a1aa9398549d8b4f5a58492db2eeb5d 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 import queue
 import signal
@@ -6,7 +7,9 @@ import sys
 import threading
 import time
 from collections import deque
+from collections.abc import Generator
 from concurrent.futures import Future
+from contextlib import ExitStack, contextmanager
 from inspect import isclass, signature
 from logging import DEBUG
 from typing import Any, Callable, Optional, TypeVar, Union
@@ -22,7 +25,7 @@ from vllm.logging_utils.dump_input import dump_engine_exception
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
-from vllm.utils import make_zmq_socket, resolve_obj_by_qualname, zmq_socket_ctx
+from vllm.utils import make_zmq_socket, resolve_obj_by_qualname
 from vllm.v1.core.kv_cache_utils import (get_kv_cache_config,
                                          unify_kv_cache_configs)
 from vllm.v1.core.sched.interface import SchedulerInterface
@@ -33,10 +36,12 @@ from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
 from vllm.v1.engine.mm_input_cache import MirroredProcessingCache
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.kv_cache_interface import KVCacheConfig
+from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
 from vllm.v1.structured_output import StructuredOutputManager
+from vllm.v1.utils import EngineHandshakeMetadata, EngineZmqAddresses
 from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
@@ -123,7 +128,6 @@ class EngineCore:
             logger.info("Batch queue is enabled with size %d",
                         self.batch_queue_size)
             self.batch_queue = queue.Queue(self.batch_queue_size)
-        self.vllm_config = vllm_config
 
     def _initialize_kv_caches(
             self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
@@ -212,24 +216,27 @@ class EngineCore:
             # Re-raise exception
             raise err
 
-    def step(self) -> EngineCoreOutputs:
-        """Schedule, execute, and make output."""
+    def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
+        """Schedule, execute, and make output.
+
+        Returns tuple of outputs and a flag indicating whether the model
+        was executed.
+        """
 
         # Check for any requests remaining in the scheduler - unfinished,
         # or finished and not yet removed from the batch.
         if not self.scheduler.has_requests():
-            return EngineCoreOutputs(
-                outputs=[],
-                scheduler_stats=self.scheduler.make_stats(),
-            )
+            return {}, False
         scheduler_output = self.scheduler.schedule()
         model_output = self.execute_model(scheduler_output)
         engine_core_outputs = self.scheduler.update_from_output(
             scheduler_output, model_output)  # type: ignore
 
-        return engine_core_outputs
+        return (engine_core_outputs,
+                scheduler_output.total_num_scheduled_tokens > 0)
 
-    def step_with_batch_queue(self) -> Optional[EngineCoreOutputs]:
+    def step_with_batch_queue(
+            self) -> tuple[Optional[dict[int, EngineCoreOutputs]], bool]:
         """Schedule and execute batches with the batch queue.
         Note that if nothing to output in this step, None is returned.
 
@@ -271,10 +278,10 @@ class EngineCore:
             # Blocking until the first result is available.
             model_output = future.result()
             self.batch_queue.task_done()
-            engine_core_outputs = self.scheduler.update_from_output(
-                scheduler_output, model_output)
+            engine_core_outputs = (self.scheduler.update_from_output(
+                scheduler_output, model_output))
 
-        return engine_core_outputs
+        return engine_core_outputs, scheduled_batch
 
     def shutdown(self):
         self.structured_output_manager.clear_backend()
@@ -357,78 +364,88 @@ class EngineCoreProc(EngineCore):
         self,
         vllm_config: VllmConfig,
         on_head_node: bool,
-        input_address: str,
+        handshake_address: str,
         executor_class: type[Executor],
         log_stats: bool,
         engine_index: int = 0,
     ):
-        input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]()
-
-        executor_fail_callback = lambda: input_queue.put_nowait(
+        self.input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]()
+        self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs],
+                                              bytes]]()
+        executor_fail_callback = lambda: self.input_queue.put_nowait(
             (EngineCoreRequestType.EXECUTOR_FAILED, b''))
 
-        # Create input socket.
-        input_ctx = zmq.Context()
-        identity = engine_index.to_bytes(length=2, byteorder="little")
-        input_socket = make_zmq_socket(input_ctx,
-                                       input_address,
-                                       zmq.DEALER,
-                                       identity=identity,
-                                       bind=False)
-        try:
-            # Register engine with front-end.
-            output_address = self.startup_handshake(
-                input_socket, on_head_node, vllm_config.parallel_config)
+        self.engine_index = engine_index
+        identity = self.engine_index.to_bytes(length=2, byteorder="little")
+        self.engines_running = False
 
-            # Update config which may have changed from the handshake.
-            vllm_config.__post_init__()
+        with self._perform_handshake(handshake_address, identity, on_head_node,
+                                     vllm_config) as addresses:
+            self.client_count = len(addresses.outputs)
 
             # Set up data parallel environment.
+            self.has_coordinator = addresses.coordinator_output is not None
             self._init_data_parallel(vllm_config)
 
-            # Initialize engine core and model.
             super().__init__(vllm_config, executor_class, log_stats,
                              executor_fail_callback)
 
-            self.step_fn = (self.step if self.batch_queue is None else
-                            self.step_with_batch_queue)
-            self.engines_running = False
+        self.step_fn = (self.step if self.batch_queue is None else
+                        self.step_with_batch_queue)
+
+        # Background Threads and Queues for IO. These enable us to
+        # overlap ZMQ socket IO with GPU since they release the GIL,
+        # and to overlap some serialization/deserialization with the
+        # model forward pass.
+        # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
+        threading.Thread(target=self.process_input_sockets,
+                         args=(addresses.inputs, addresses.coordinator_input,
+                               identity),
+                         daemon=True).start()
+        self.output_thread = threading.Thread(
+            target=self.process_output_sockets,
+            args=(addresses.outputs, addresses.coordinator_output,
+                  self.engine_index),
+            daemon=True)
+        self.output_thread.start()
+
+    @contextmanager
+    def _perform_handshake(
+            self, handshake_address: str, identity: bytes, on_head_node: bool,
+            vllm_config: VllmConfig
+    ) -> Generator[EngineZmqAddresses, None, None]:
+        input_ctx = zmq.Context()
+        with make_zmq_socket(input_ctx,
+                             handshake_address,
+                             zmq.DEALER,
+                             identity=identity,
+                             linger=5000,
+                             bind=False) as handshake_socket:
+            # Register engine with front-end.
+            addresses = self.startup_handshake(handshake_socket, on_head_node,
+                                               vllm_config.parallel_config)
+
+            # Update config which may have changed from the handshake
+            vllm_config.__post_init__()
+
+            yield addresses
 
             # Send ready message.
             num_gpu_blocks = vllm_config.cache_config.num_gpu_blocks
-            input_socket.send(
+            handshake_socket.send(
                 msgspec.msgpack.encode({
                     "status": "READY",
                     "local": on_head_node,
                     "num_gpu_blocks": num_gpu_blocks,
                 }))
 
-            # Background Threads and Queues for IO. These enable us to
-            # overlap ZMQ socket IO with GPU since they release the GIL,
-            # and to overlap some serialization/deserialization with the
-            # model forward pass.
-            # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
-            self.input_queue = input_queue
-            self.output_queue = queue.Queue[Union[EngineCoreOutputs, bytes]]()
-            threading.Thread(target=self.process_input_socket,
-                             args=(input_socket, ),
-                             daemon=True).start()
-            input_socket = None
-            self.output_thread = threading.Thread(
-                target=self.process_output_socket,
-                args=(output_address, engine_index),
-                daemon=True)
-            self.output_thread.start()
-        finally:
-            if input_socket is not None:
-                input_socket.close(linger=0)
-
     @staticmethod
-    def startup_handshake(input_socket: zmq.Socket, on_head_node: bool,
-                          parallel_config: ParallelConfig) -> str:
+    def startup_handshake(
+            handshake_socket: zmq.Socket, on_head_node: bool,
+            parallel_config: ParallelConfig) -> EngineZmqAddresses:
 
         # Send registration message.
-        input_socket.send(
+        handshake_socket.send(
             msgspec.msgpack.encode({
                 "status": "HELLO",
                 "local": on_head_node,
@@ -436,22 +453,20 @@ class EngineCoreProc(EngineCore):
 
         # Receive initialization message.
         logger.info("Waiting for init message from front-end.")
-        if not input_socket.poll(timeout=HANDSHAKE_TIMEOUT_MINS * 60 * 1000):
+        if not handshake_socket.poll(timeout=HANDSHAKE_TIMEOUT_MINS * 60_000):
             raise RuntimeError("Did not receive response from front-end "
                                f"process within {HANDSHAKE_TIMEOUT_MINS} "
                                f"minutes")
-        init_bytes = input_socket.recv()
-        init_message = msgspec.msgpack.decode(init_bytes)
+        init_bytes = handshake_socket.recv()
+        init_message: EngineHandshakeMetadata = msgspec.msgpack.decode(
+            init_bytes, type=EngineHandshakeMetadata)
         logger.debug("Received init message: %s", init_message)
 
-        output_socket_address = init_message["output_socket_address"]
-        #TBD(nick) maybe replace IP with configured head node address
-
-        received_parallel_config = init_message["parallel_config"]
+        received_parallel_config = init_message.parallel_config
         for key, value in received_parallel_config.items():
             setattr(parallel_config, key, value)
 
-        return output_socket_address
+        return init_message.addresses
 
         self.global_unfinished_reqs = False
 
@@ -532,7 +547,7 @@ class EngineCoreProc(EngineCore):
         """Exits when an engine step needs to be performed."""
 
         waited = False
-        while not self.engines_running and not (self.scheduler.has_requests()):
+        while not self.engines_running and not self.scheduler.has_requests():
             if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
                 logger.debug("EngineCore waiting for work.")
                 waited = True
@@ -547,14 +562,16 @@ class EngineCoreProc(EngineCore):
             req = self.input_queue.get_nowait()
             self._handle_client_request(*req)
 
-    def _process_engine_step(self):
+    def _process_engine_step(self) -> bool:
         """Called only when there are unfinished local requests."""
 
         # Step the engine core.
-        outputs = self.step_fn()
+        outputs, model_executed = self.step_fn()
         # Put EngineCoreOutputs into the output queue.
-        if outputs is not None:
-            self.output_queue.put_nowait(outputs)
+        for output in (outputs.items() if outputs else ()):
+            self.output_queue.put_nowait(output)
+
+        return model_executed
 
     def _handle_client_request(self, request_type: EngineCoreRequestType,
                                request: Any) -> None:
@@ -565,7 +582,7 @@ class EngineCoreProc(EngineCore):
         elif request_type == EngineCoreRequestType.ABORT:
             self.abort_requests(request)
         elif request_type == EngineCoreRequestType.UTILITY:
-            call_id, method_name, args = request
+            client_idx, call_id, method_name, args = request
             output = UtilityOutput(call_id)
             try:
                 method = getattr(self, method_name)
@@ -576,7 +593,7 @@ class EngineCoreProc(EngineCore):
                 output.failure_message = (f"Call to {method_name} method"
                                           f" failed: {str(e)}")
             self.output_queue.put_nowait(
-                EngineCoreOutputs(utility_output=output))
+                (client_idx, EngineCoreOutputs(utility_output=output)))
         elif request_type == EngineCoreRequestType.EXECUTOR_FAILED:
             raise RuntimeError("Executor failed.")
         else:
@@ -609,27 +626,68 @@ class EngineCoreProc(EngineCore):
             logger.fatal("vLLM shutdown signal from EngineCore failed "
                          "to send. Please report this issue.")
 
-    def process_input_socket(self, input_socket: zmq.Socket):
+    def process_input_sockets(self, input_addresses: list[str],
+                              coord_input_address: Optional[str],
+                              identity: bytes):
         """Input socket IO thread."""
 
         # Msgpack serialization decoding.
         add_request_decoder = MsgpackDecoder(EngineCoreRequest)
         generic_decoder = MsgpackDecoder()
 
-        while True:
-            # (RequestType, RequestData)
-            type_frame, *data_frames = input_socket.recv_multipart(copy=False)
-            request_type = EngineCoreRequestType(bytes(type_frame.buffer))
-
-            # Deserialize the request data.
-            decoder = add_request_decoder if (
-                request_type == EngineCoreRequestType.ADD) else generic_decoder
-            request = decoder.decode(data_frames)
-
-            # Push to input queue for core busy loop.
-            self.input_queue.put_nowait((request_type, request))
+        with ExitStack() as stack, zmq.Context() as ctx:
+            input_sockets = [
+                stack.enter_context(
+                    make_zmq_socket(ctx,
+                                    input_address,
+                                    zmq.DEALER,
+                                    identity=identity,
+                                    bind=False))
+                for input_address in input_addresses
+            ]
+            if coord_input_address is None:
+                coord_socket = None
+            else:
+                coord_socket = stack.enter_context(
+                    make_zmq_socket(ctx,
+                                    coord_input_address,
+                                    zmq.XSUB,
+                                    identity=identity,
+                                    bind=False))
+                # Send subscription message to coordinator.
+                coord_socket.send(b'\x01')
+
+            # Register sockets with poller.
+            poller = zmq.Poller()
+            for input_socket in input_sockets:
+                # Send initial message to each input socket - this is required
+                # before the front-end ROUTER socket can send input messages
+                # back to us.
+                input_socket.send(b'')
+                poller.register(input_socket, zmq.POLLIN)
+            if coord_socket is not None:
+                poller.register(coord_socket, zmq.POLLIN)
 
-    def process_output_socket(self, output_path: str, engine_index: int):
+            while True:
+                for input_socket, _ in poller.poll():
+                    # (RequestType, RequestData)
+                    type_frame, *data_frames = input_socket.recv_multipart(
+                        copy=False)
+                    request_type = EngineCoreRequestType(
+                        bytes(type_frame.buffer))
+
+                    # Deserialize the request data.
+                    decoder = add_request_decoder if (
+                        request_type
+                        == EngineCoreRequestType.ADD) else generic_decoder
+                    request = decoder.decode(data_frames)
+
+                    # Push to input queue for core busy loop.
+                    self.input_queue.put_nowait((request_type, request))
+
+    def process_output_sockets(self, output_paths: list[str],
+                               coord_output_path: Optional[str],
+                               engine_index: int):
         """Output socket IO thread."""
 
         # Msgpack serialization encoding.
@@ -643,30 +701,49 @@ class EngineCoreProc(EngineCore):
 
         # We must set linger to ensure the ENGINE_CORE_DEAD
         # message is sent prior to closing the socket.
-        with zmq_socket_ctx(output_path, zmq.constants.PUSH,
-                            linger=4000) as socket:
+        with ExitStack() as stack, zmq.Context() as ctx:
+            sockets = [
+                stack.enter_context(
+                    make_zmq_socket(ctx, output_path, zmq.PUSH, linger=4000))
+                for output_path in output_paths
+            ]
+            coord_socket = stack.enter_context(
+                make_zmq_socket(
+                    ctx, coord_output_path, zmq.PUSH, bind=False,
+                    linger=4000)) if coord_output_path is not None else None
+            max_reuse_bufs = len(sockets) + 1
+
             while True:
-                outputs = self.output_queue.get()
-                if outputs == EngineCoreProc.ENGINE_CORE_DEAD:
-                    socket.send(outputs, copy=False)
+                output = self.output_queue.get()
+                if output == EngineCoreProc.ENGINE_CORE_DEAD:
+                    for socket in sockets:
+                        socket.send(output)
                     break
-                assert not isinstance(outputs, bytes)
+                assert not isinstance(output, bytes)
+                client_index, outputs = output
                 outputs.engine_index = engine_index
 
+                if client_index == -1:
+                    # Don't reuse buffer for coordinator message
+                    # which will be very small.
+                    assert coord_socket is not None
+                    coord_socket.send_multipart(encoder.encode(outputs))
+                    continue
+
                 # Reclaim buffers that zmq is finished with.
                 while pending and pending[-1][0].done:
                     reuse_buffers.append(pending.pop()[2])
 
                 buffer = reuse_buffers.pop() if reuse_buffers else bytearray()
                 buffers = encoder.encode_into(outputs, buffer)
-                tracker = socket.send_multipart(buffers,
-                                                copy=False,
-                                                track=True)
+                tracker = sockets[client_index].send_multipart(buffers,
+                                                               copy=False,
+                                                               track=True)
                 if not tracker.done:
                     ref = outputs if len(buffers) > 1 else None
                     pending.appendleft((tracker, ref, buffer))
-                elif len(reuse_buffers) < 2:
-                    # Keep at most 2 buffers to reuse.
+                elif len(reuse_buffers) < max_reuse_bufs:
+                    # Limit the number of buffers to reuse.
                     reuse_buffers.append(buffer)
 
 
@@ -678,27 +755,33 @@ class DPEngineCoreProc(EngineCoreProc):
         self,
         vllm_config: VllmConfig,
         on_head_node: bool,
-        input_address: str,
+        handshake_address: str,
         executor_class: type[Executor],
         log_stats: bool,
     ):
-        # Add process-specific prefix to stdout and stderr before
-        # we initialize the engine.
-        from multiprocessing import current_process
-        process_name = current_process().name
-        pid = os.getpid()
-        _add_prefix(sys.stdout, process_name, pid)
-        _add_prefix(sys.stderr, process_name, pid)
+
+        self._decorate_logs()
 
         # Counts forward-passes of the model so that we can synchronize
         # finished with DP peers every N steps.
         self.counter = 0
+        self.current_wave = 0
+        self.last_counts = (0, 0)
 
         # Initialize the engine.
         dp_rank = vllm_config.parallel_config.data_parallel_rank
-        super().__init__(vllm_config, on_head_node, input_address,
+        super().__init__(vllm_config, on_head_node, handshake_address,
                          executor_class, log_stats, dp_rank)
 
+    def _decorate_logs(self):
+        # Add process-specific prefix to stdout and stderr before
+        # we initialize the engine.
+        from multiprocessing import current_process
+        process_name = current_process().name
+        pid = os.getpid()
+        _add_prefix(sys.stdout, process_name, pid)
+        _add_prefix(sys.stderr, process_name, pid)
+
     def _init_data_parallel(self, vllm_config: VllmConfig):
 
         # Configure GPUs and stateless process group for data parallel.
@@ -709,6 +792,15 @@ class DPEngineCoreProc(EngineCoreProc):
         assert dp_size > 1
         assert 0 <= local_dp_rank <= dp_rank < dp_size
 
+        if vllm_config.kv_transfer_config is not None:
+            # modify the engine_id and append the local_dp_rank to it to ensure
+            # that the kv_transfer_config is unique for each DP rank.
+            vllm_config.kv_transfer_config.engine_id = (
+                f"{vllm_config.kv_transfer_config.engine_id}_dp{local_dp_rank}"
+            )
+            logger.debug("Setting kv_transfer_config.engine_id to %s",
+                         vllm_config.kv_transfer_config.engine_id)
+
         from vllm.platforms import current_platform
         device_control_env_var = current_platform.device_control_env_var
         world_size = vllm_config.parallel_config.world_size
@@ -719,7 +811,6 @@ class DPEngineCoreProc(EngineCoreProc):
 
         self.dp_rank = dp_rank
         self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
-        self.current_wave = 0
 
     def shutdown(self):
         super().shutdown()
@@ -727,22 +818,23 @@ class DPEngineCoreProc(EngineCoreProc):
             stateless_destroy_torch_distributed_process_group(dp_group)
 
     def add_request(self, request: EngineCoreRequest):
-        if request.current_wave != self.current_wave:
+        if self.has_coordinator and request.current_wave != self.current_wave:
             if request.current_wave > self.current_wave:
                 self.current_wave = request.current_wave
             elif not self.engines_running:
                 # Request received for an already-completed wave, notify
                 # front-end that we need to start the next one.
                 self.output_queue.put_nowait(
-                    EngineCoreOutputs(start_wave=self.current_wave))
+                    (-1, EngineCoreOutputs(start_wave=self.current_wave)))
 
         super().add_request(request)
 
     def _handle_client_request(self, request_type: EngineCoreRequestType,
                                request: Any) -> None:
         if request_type == EngineCoreRequestType.START_DP_WAVE:
-            new_wave: int = request
-            if new_wave >= self.current_wave:
+            new_wave, exclude_eng_index = request
+            if exclude_eng_index != self.engine_index and (
+                    new_wave >= self.current_wave):
                 self.current_wave = new_wave
                 if not self.engines_running:
                     logger.debug("EngineCore starting idle loop for wave %d.",
@@ -751,6 +843,18 @@ class DPEngineCoreProc(EngineCoreProc):
         else:
             super()._handle_client_request(request_type, request)
 
+    def _maybe_publish_request_counts(self):
+        if not self.has_coordinator:
+            return
+
+        # Publish our request counts (if they've changed).
+        counts = self.scheduler.get_request_counts()
+        if counts != self.last_counts:
+            self.last_counts = counts
+            stats = SchedulerStats(*counts)
+            self.output_queue.put_nowait(
+                (-1, EngineCoreOutputs(scheduler_stats=stats)))
+
     def run_busy_loop(self):
         """Core busy loop of the EngineCore for data parallel case."""
 
@@ -759,30 +863,18 @@ class DPEngineCoreProc(EngineCoreProc):
             # 1) Poll the input queue until there is work to do.
             self._process_input_queue()
 
-            local_unfinished_reqs = self.scheduler.has_unfinished_requests()
-
-            if local_unfinished_reqs:
-                # 2) Step the engine core.
-                self._process_engine_step()
-
-                # Check if we have now finished all requests.
-                local_unfinished_reqs = (
-                    self.scheduler.has_unfinished_requests())
-            else:
-                if self.scheduler.has_finished_requests():
-                    # There are no unfinished requests, but there are some
-                    # finished requests remaining to be removed from the
-                    # batch state. This engine step won't perform a forward
-                    # pass but will flush the finished requests to ensure
-                    # up-to-date state is returned in the engine outputs.
-                    self._process_engine_step()
+            # 2) Step the engine core.
+            executed = self._process_engine_step()
+            self._maybe_publish_request_counts()
 
-                if not self.engines_running:
+            local_unfinished_reqs = self.scheduler.has_unfinished_requests()
+            if not executed:
+                if not local_unfinished_reqs and not self.engines_running:
                     # All engines are idle.
                     continue
 
-                # There must be unfinished requests in DP peers, run a
-                # dummy forward pass.
+                # We are in a running state and so must execute a dummy pass
+                # if the model didn't execute any ready requests.
                 self.execute_dummy_batch()
 
             # 3) All-reduce operation to determine global unfinished reqs.
@@ -795,7 +887,8 @@ class DPEngineCoreProc(EngineCoreProc):
                     logger.debug("Wave %d finished, pausing engine loop.",
                                  self.current_wave)
                     self.output_queue.put_nowait(
-                        EngineCoreOutputs(wave_complete=self.current_wave))
+                        (-1,
+                         EngineCoreOutputs(wave_complete=self.current_wave)))
                 self.current_wave += 1
 
     def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
@@ -808,3 +901,70 @@ class DPEngineCoreProc(EngineCoreProc):
 
         return ParallelConfig.has_unfinished_dp(self.dp_group,
                                                 local_unfinished)
+
+
+class DPEngineCoreActor(DPEngineCoreProc):
+    """
+    Ray actor for running EngineCore in a data parallel context
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        on_head_node: bool,
+        addresses: EngineZmqAddresses,
+        executor_class: type[Executor],
+        log_stats: bool,
+        dp_rank: int = 0,
+        local_dp_rank: int = 0,
+    ):
+        self.addresses = addresses
+        vllm_config.parallel_config.data_parallel_rank = dp_rank
+        vllm_config.parallel_config.data_parallel_rank_local = \
+            local_dp_rank
+
+        # Ray sets CUDA_VISIBLE_DEVICES to empty string,
+        # we clean this up to be able to properly initialize
+        # data parallel groups.
+        del os.environ['CUDA_VISIBLE_DEVICES']
+
+        super().__init__(vllm_config, on_head_node, "", executor_class,
+                         log_stats)
+
+    def _decorate_logs(self):
+        pass
+
+    @contextmanager
+    def _perform_handshake(self, handshake_address: str, identity: bytes,
+                           on_head_node: bool, vllm_config: VllmConfig):
+        """
+        For Ray, we don't need to actually perform handshake.
+        All addresses information is known before the actor creation.
+        Therefore, we simply yield these addresses.
+        """
+        yield self.addresses
+
+    def wait_for_init(self):
+        """
+        Wait until the engine core is initialized.
+
+        This is just an empty method. When ray.get() on this method
+        (or any other method of the actor) returns, it is guaranteed
+        that actor creation (i.e., __init__) is complete.
+        """
+        pass
+
+    def run(self):
+        """
+        Run the engine core busy loop.
+        """
+        try:
+            self.run_busy_loop()
+        except SystemExit:
+            logger.debug("EngineCore exiting.")
+            raise
+        except Exception:
+            logger.exception("EngineCore encountered a fatal error.")
+            raise
+        finally:
+            self.shutdown()
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index c04cb7d54249d37a4e9dbb72e597bb4ef545ca43..48b6da17bb0ffafb1fd9b9e3e95ad69349d94b3a 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -1,7 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import contextlib
 import queue
+import sys
 import uuid
 import weakref
 from abc import ABC, abstractmethod
@@ -10,26 +12,28 @@ from collections import deque
 from collections.abc import Awaitable, Sequence
 from concurrent.futures import Future
 from dataclasses import dataclass
-from enum import Enum, auto
 from threading import Thread
 from typing import Any, Callable, Optional, TypeVar, Union
 
-import msgspec
+import msgspec.msgpack
 import zmq
 import zmq.asyncio
 
-from vllm.config import ParallelConfig, VllmConfig
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.utils import (get_open_port, get_open_zmq_inproc_path,
-                        get_open_zmq_ipc_path, get_tcp_uri, make_zmq_socket)
+from vllm.utils import (get_open_zmq_inproc_path, make_zmq_socket,
+                        zmq_socket_ctx)
 from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
                             EngineCoreRequestType, UtilityOutput)
+from vllm.v1.engine.coordinator import DPCoordinator
 from vllm.v1.engine.core import EngineCore, EngineCoreProc
 from vllm.v1.engine.exceptions import EngineDeadError
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr
-from vllm.v1.utils import CoreEngineProcManager
+from vllm.v1.utils import (CoreEngine, CoreEngineActorManager,
+                           CoreEngineProcManager, EngineZmqAddresses,
+                           get_engine_client_zmq_addr, wait_for_engine_startup)
 
 logger = init_logger(__name__)
 
@@ -65,16 +69,31 @@ class EngineCoreClient(ABC):
                 "is not currently supported.")
 
         if multiprocess_mode and asyncio_mode:
-            if vllm_config.parallel_config.data_parallel_size > 1:
-                return DPAsyncMPClient(vllm_config, executor_class, log_stats)
-
-            return AsyncMPClient(vllm_config, executor_class, log_stats)
+            return EngineCoreClient.make_async_mp_client(
+                vllm_config, executor_class, log_stats)
 
         if multiprocess_mode and not asyncio_mode:
             return SyncMPClient(vllm_config, executor_class, log_stats)
 
         return InprocClient(vllm_config, executor_class, log_stats)
 
+    @staticmethod
+    def make_async_mp_client(
+        vllm_config: VllmConfig,
+        executor_class: type[Executor],
+        log_stats: bool,
+        client_addresses: Optional[dict[str, str]] = None,
+        client_index: int = 0,
+    ) -> "MPClient":
+        if vllm_config.parallel_config.data_parallel_size > 1:
+            if vllm_config.parallel_config.data_parallel_backend == "ray":
+                return RayDPClient(vllm_config, executor_class, log_stats,
+                                   client_addresses, client_index)
+            return DPAsyncMPClient(vllm_config, executor_class, log_stats,
+                                   client_addresses, client_index)
+        return AsyncMPClient(vllm_config, executor_class, log_stats,
+                             client_addresses, client_index)
+
     @abstractmethod
     def shutdown(self):
         ...
@@ -205,7 +224,8 @@ class InprocClient(EngineCoreClient):
         self.engine_core = EngineCore(*args, **kwargs)
 
     def get_output(self) -> EngineCoreOutputs:
-        return self.engine_core.step()
+        outputs, _ = self.engine_core.step()
+        return outputs.get(0) or EngineCoreOutputs()
 
     def add_request(self, request: EngineCoreRequest) -> None:
         self.engine_core.add_request(request)
@@ -264,34 +284,22 @@ class InprocClient(EngineCoreClient):
         return self.engine_core.collective_rpc(method, timeout, args, kwargs)
 
 
-class CoreEngineState(Enum):
-    NEW = auto()
-    CONNECTED = auto()
-    READY = auto()
-
-
-class CoreEngine:
-    """One per data parallel rank."""
-
-    def __init__(self, index: int = 0, local: bool = True):
-        self.local = local
-        self.index = index
-        self.identity = index.to_bytes(length=2, byteorder="little")
-
-        self.state = CoreEngineState.NEW
-        self.num_reqs_in_flight = 0
-
-
 @dataclass
 class BackgroundResources:
     """Used as a finalizer for clean shutdown, avoiding
     circular reference back to the client object."""
 
     ctx: Union[zmq.Context]
-    local_engine_manager: Optional[CoreEngineProcManager] = None
+    # If CoreEngineProcManager, it manages local engines;
+    # if CoreEngineActorManager, it manages all engines.
+    engine_manager: Optional[Union[CoreEngineProcManager,
+                                   CoreEngineActorManager]] = None
+    coordinator: Optional[DPCoordinator] = None
     output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
     input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
+    first_req_send_socket: Optional[zmq.asyncio.Socket] = None
     output_queue_task: Optional[asyncio.Task] = None
+    stats_update_task: Optional[asyncio.Task] = None
     shutdown_path: Optional[str] = None
 
     # Set if any of the engines are dead. Here so that the output
@@ -302,16 +310,23 @@ class BackgroundResources:
         """Clean up background resources."""
 
         self.engine_dead = True
-        if self.local_engine_manager is not None:
-            self.local_engine_manager.close()
+        if self.engine_manager is not None:
+            self.engine_manager.close()
+        if self.coordinator is not None:
+            self.coordinator.close()
 
         if self.output_queue_task is not None:
             self.output_queue_task.cancel()
+        if self.stats_update_task is not None:
+            self.stats_update_task.cancel()
 
         # ZMQ context termination can hang if the sockets
         # aren't explicitly closed first.
-        if self.output_socket is not None:
-            self.output_socket.close(linger=0)
+        for socket in (self.output_socket, self.input_socket,
+                       self.first_req_send_socket):
+            if socket is not None:
+                socket.close(linger=0)
+
         if self.shutdown_path is not None:
             # We must ensure that the sync output socket is
             # closed cleanly in its own thread.
@@ -346,6 +361,7 @@ class MPClient(EngineCoreClient):
         vllm_config: VllmConfig,
         executor_class: type[Executor],
         log_stats: bool,
+        client_addresses: Optional[dict[str, str]] = None,
     ):
         self.vllm_config = vllm_config
         # Serialization setup.
@@ -365,8 +381,9 @@ class MPClient(EngineCoreClient):
         try:
             parallel_config = vllm_config.parallel_config
             local_engine_count = parallel_config.data_parallel_size_local
-            start_index = parallel_config.data_parallel_rank
             local_start_index = parallel_config.data_parallel_rank_local
+            dp_size = parallel_config.data_parallel_size
+            dp_rank = parallel_config.data_parallel_rank
 
             # SPMD mode is where there is an LLM instance per DP rank and
             # one core engine per LLM, see
@@ -374,46 +391,55 @@ class MPClient(EngineCoreClient):
             spmd_mode = local_start_index is not None
             if spmd_mode:
                 assert local_engine_count == 1
-                self.core_engines = [
-                    CoreEngine(index=local_start_index, local=True)
-                ]
+                self.core_engines = [CoreEngine(index=dp_rank, local=True)]
             else:
-                assert start_index == 0
+                assert dp_rank == 0
                 local_start_index = 0
                 self.core_engines = [
                     CoreEngine(index=i, local=(i < local_engine_count))
-                    for i in range(parallel_config.data_parallel_size)
+                    for i in range(dp_size)
                 ]
 
-            input_address, output_address = self._get_zmq_addresses(
-                parallel_config, spmd_mode)
+            local_only = spmd_mode or local_engine_count == dp_size
+
+            self.stats_update_address: Optional[str] = None
+            if client_addresses is not None:
+                input_address = client_addresses["input_address"]
+                output_address = client_addresses["output_address"]
+                self.stats_update_address = client_addresses.get(
+                    "stats_update_address")
+            else:
+                host = parallel_config.data_parallel_master_ip
+                input_address = get_engine_client_zmq_addr(local_only, host)
+                output_address = get_engine_client_zmq_addr(local_only, host)
 
             # Create input and output sockets.
             self.input_socket = self.resources.input_socket = make_zmq_socket(
                 self.ctx, input_address, zmq.ROUTER, bind=True)
-
             self.resources.output_socket = make_zmq_socket(
-                self.ctx, output_address, zmq.constants.PULL)
-            # Start local engines.
-            if local_engine_count:
-                # In server mode, start_index and local_start_index will
-                # both be 0.
-                self.resources.local_engine_manager = CoreEngineProcManager(
-                    EngineCoreProc.run_engine_core,
-                    vllm_config=vllm_config,
-                    executor_class=executor_class,
-                    log_stats=log_stats,
-                    input_address=input_address,
-                    on_head_node=True,
-                    local_engine_count=local_engine_count,
-                    start_index=start_index,
-                    local_start_index=local_start_index)
+                self.ctx, output_address, zmq.PULL)
+
+            if client_addresses is None:
+                self._init_engines_direct(vllm_config, local_only,
+                                          local_start_index, input_address,
+                                          output_address, executor_class,
+                                          log_stats)
+                coordinator = self.resources.coordinator
+                if coordinator:
+                    self.stats_update_address = (
+                        coordinator.get_stats_publish_address())
+
+            # Wait for ready messages from each engine on the input socket.
+            identities = set(e.identity for e in self.core_engines)
+            sync_input_socket = zmq.Socket.shadow(self.input_socket)
+            while identities:
+                if not sync_input_socket.poll(timeout=600_000):
+                    raise TimeoutError("Timed out waiting for engines to send"
+                                       "initial message on input socket.")
+                identity, _ = sync_input_socket.recv_multipart()
+                identities.remove(identity)
 
             self.core_engine = self.core_engines[0]
-
-            # Wait for engine core process(es) to start.
-            self._wait_for_engine_startup(output_address, parallel_config)
-
             self.utility_results: dict[int, AnyFuture] = {}
 
             # Request objects which may contain pytorch-allocated tensors
@@ -426,116 +452,72 @@ class MPClient(EngineCoreClient):
             if not success:
                 self._finalizer()
 
-    @staticmethod
-    def _get_zmq_addresses(parallel_config: ParallelConfig,
-                           spmd_mode: bool) -> tuple[str, str]:
-        """Returns (input_address, output_address)."""
-        dp_size = parallel_config.data_parallel_size
+    def _init_engines_direct(self, vllm_config: VllmConfig, local_only: bool,
+                             local_start_index: int, input_address: str,
+                             output_address: str,
+                             executor_class: type[Executor], log_stats: bool):
+        """Self-contained client mode, launch engine and coordinator process
+        as needed."""
+
+        parallel_config = vllm_config.parallel_config
         local_engine_count = parallel_config.data_parallel_size_local
+        start_index = parallel_config.data_parallel_rank
+        host = parallel_config.data_parallel_master_ip
 
-        if local_engine_count == dp_size or spmd_mode:
-            input_address = get_open_zmq_ipc_path()
-            output_address = get_open_zmq_ipc_path()
-        else:
-            host = parallel_config.data_parallel_master_ip
-            input_port = parallel_config.data_parallel_rpc_port
-            output_port = get_open_port()
-            input_address = get_tcp_uri(host, input_port)
-            output_address = get_tcp_uri(host, output_port)
-
-        return input_address, output_address
-
-    def _wait_for_engine_startup(self, output_address: str,
-                                 parallel_config: ParallelConfig):
-        # Get a sync handle to the socket which can be sync or async.
-        sync_input_socket = zmq.Socket.shadow(self.input_socket)
-
-        # Wait for engine core process(es) to send ready messages.
-        local_count = parallel_config.data_parallel_size_local
-        remote_count = len(self.core_engines) - local_count
-        # [local, remote] counts
-        conn_pending, start_pending = [local_count, remote_count], [0, 0]
-
-        poller = zmq.Poller()
-        poller.register(sync_input_socket, zmq.POLLIN)
-        proc_manager = self.resources.local_engine_manager
-        if proc_manager is not None:
-            for sentinel in proc_manager.sentinels():
-                poller.register(sentinel, zmq.POLLIN)
-        while any(conn_pending) or any(start_pending):
-            events = poller.poll(STARTUP_POLL_PERIOD_MS)
-            if not events:
-                if any(conn_pending):
-                    logger.debug(
-                        "Waiting for %d local, %d remote core engine proc(s) "
-                        "to connect.", *conn_pending)
-                if any(start_pending):
-                    logger.debug(
-                        "Waiting for %d local, %d remote core engine proc(s) "
-                        "to start.", *start_pending)
-                continue
-            if len(events) > 1 or events[0][0] != sync_input_socket:
-                # One of the local core processes exited.
-                finished = proc_manager.finished_procs(
-                ) if proc_manager else {}
-                raise RuntimeError("Engine core initialization failed. "
-                                   "See root cause above. "
-                                   f"Failed core proc(s): {finished}")
-
-            # Receive HELLO and READY messages from the input socket.
-            eng_identity, ready_msg_bytes = sync_input_socket.recv_multipart()
-            eng_index = int.from_bytes(eng_identity, byteorder="little")
-            engine = next(
-                (e for e in self.core_engines if e.identity == eng_identity),
-                None)
-            if engine is None:
-                raise RuntimeError(f"Message from engine with unexpected data "
-                                   f"parallel rank: {eng_index}")
-            msg = msgspec.msgpack.decode(ready_msg_bytes)
-            status, local = msg["status"], msg["local"]
-            if local != engine.local:
-                raise RuntimeError(f"{status} message from "
-                                   f"{'local' if local else 'remote'} "
-                                   f"engine {eng_index}, expected it to be "
-                                   f"{'local' if engine.local else 'remote'}")
-
-            if status == "HELLO" and engine.state == CoreEngineState.NEW:
-
-                # Send init message with DP config info.
-                init_message = self.encoder.encode({
-                    "output_socket_address": output_address,
-                    "parallel_config": {
-                        "data_parallel_master_ip":
-                        parallel_config.data_parallel_master_ip,
-                        "data_parallel_master_port":
-                        parallel_config.data_parallel_master_port,
-                        "data_parallel_size":
-                        parallel_config.data_parallel_size,
-                    },
-                })
-                sync_input_socket.send_multipart((eng_identity, *init_message),
-                                                 copy=False)
-                conn_pending[0 if local else 1] -= 1
-                start_pending[0 if local else 1] += 1
-                engine.state = CoreEngineState.CONNECTED
-            elif status == "READY" and (engine.state
-                                        == CoreEngineState.CONNECTED):
-                # Setup KV cache config with initialization state from
-                # engine core process. Sum values from all engines in DP case.
-                cache_config = self.vllm_config.cache_config
-                num_gpu_blocks = cache_config.num_gpu_blocks or 0
-                num_gpu_blocks += msg['num_gpu_blocks']
-                cache_config.num_gpu_blocks = num_gpu_blocks
-
-                start_pending[0 if local else 1] -= 1
-                engine.state = CoreEngineState.READY
-            else:
-                raise RuntimeError(f"Unexpected {status} message for "
-                                   f"{'local' if local else 'remote'} engine "
-                                   f"{eng_index} in {engine.state} state.")
+        if len(self.core_engines) > 1:
+            self.resources.coordinator = DPCoordinator(parallel_config)
+
+        handshake_address = get_engine_client_zmq_addr(
+            local_only, host, parallel_config.data_parallel_rpc_port)
+
+        with zmq_socket_ctx(handshake_address, zmq.ROUTER,
+                            bind=True) as handshake_socket:
+
+            # Start local engines.
+            if local_engine_count:
+                # In server mode, start_index and local_start_index will
+                # both be 0.
+                self.resources.engine_manager = CoreEngineProcManager(
+                    EngineCoreProc.run_engine_core,
+                    vllm_config=vllm_config,
+                    executor_class=executor_class,
+                    log_stats=log_stats,
+                    handshake_address=handshake_address,
+                    on_head_node=True,
+                    local_engine_count=local_engine_count,
+                    start_index=start_index,
+                    local_start_index=local_start_index)
+
+            # Wait for engine core process(es) to start.
+            self._wait_for_engine_startup(handshake_socket, input_address,
+                                          output_address)
+
+    def _wait_for_engine_startup(self, handshake_socket: zmq.Socket,
+                                 input_address: str, output_address: str):
+        addresses = EngineZmqAddresses(
+            inputs=[input_address],
+            outputs=[output_address],
+        )
 
-            logger.debug("%s from %s core engine process %s.", status,
-                         "local" if local else "remote", eng_index)
+        coordinator = self.resources.coordinator
+        if coordinator is not None:
+            addresses.coordinator_input, addresses.coordinator_output = (
+                coordinator.get_engine_socket_addresses())
+
+        proc_manager = self.resources.engine_manager
+        assert isinstance(proc_manager, (type(None), CoreEngineProcManager)), (
+            "_wait_for_engine_startup should only be "
+            "called with CoreEngineProcManager")
+
+        wait_for_engine_startup(
+            handshake_socket,
+            addresses,
+            self.core_engines,
+            self.vllm_config.parallel_config,
+            self.vllm_config.cache_config,
+            proc_manager,
+            coordinator.proc if coordinator else None,
+        )
 
     def shutdown(self):
         # Terminate background resources.
@@ -601,8 +583,8 @@ class SyncMPClient(MPClient):
             try:
                 shutdown_socket.bind(shutdown_path)
                 poller = zmq.Poller()
-                poller.register(shutdown_socket)
-                poller.register(out_socket)
+                poller.register(shutdown_socket, zmq.POLLIN)
+                poller.register(out_socket, zmq.POLLIN)
                 while True:
                     socks = poller.poll()
                     if not socks:
@@ -664,7 +646,7 @@ class SyncMPClient(MPClient):
         future: Future[Any] = Future()
         self.utility_results[call_id] = future
         self._send_input(EngineCoreRequestType.UTILITY,
-                         (call_id, method, args))
+                         (0, call_id, method, args))
 
         return future.result()
 
@@ -726,15 +708,21 @@ class SyncMPClient(MPClient):
 class AsyncMPClient(MPClient):
     """Asyncio-compatible client for multi-proc EngineCore."""
 
-    def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor],
-                 log_stats: bool):
+    def __init__(self,
+                 vllm_config: VllmConfig,
+                 executor_class: type[Executor],
+                 log_stats: bool,
+                 client_addresses: Optional[dict[str, str]] = None,
+                 client_index: int = 0):
         super().__init__(
             asyncio_mode=True,
             vllm_config=vllm_config,
             executor_class=executor_class,
             log_stats=log_stats,
+            client_addresses=client_addresses,
         )
 
+        self.client_index = client_index
         self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs,
                                                  Exception]]()
         try:
@@ -854,12 +842,13 @@ class AsyncMPClient(MPClient):
         future = asyncio.get_running_loop().create_future()
         self.utility_results[call_id] = future
         message = (EngineCoreRequestType.UTILITY.value, *self.encoder.encode(
-            (call_id, method, args)))
+            (self.client_index, call_id, method, args)))
         await self._send_input_message(message, engine, args)
         self._ensure_output_queue_task()
         return await future
 
     async def add_request_async(self, request: EngineCoreRequest) -> None:
+        request.client_index = self.client_index
         await self._send_input(EngineCoreRequestType.ADD, request)
 
     async def abort_requests_async(self, request_ids: list[str]) -> None:
@@ -920,17 +909,125 @@ class DPAsyncMPClient(AsyncMPClient):
     """Asyncio-compatible client for multi-proc, multi-engine (data parallel)
     EngineCore."""
 
-    def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor],
-                 log_stats: bool):
-
+    def __init__(self,
+                 vllm_config: VllmConfig,
+                 executor_class: type[Executor],
+                 log_stats: bool,
+                 client_addresses: Optional[dict[str, str]] = None,
+                 client_index: int = 0):
         self.current_wave = 0
         self.engines_running = False
+        # To route aborts to the correct engine.
         self.reqs_in_flight: dict[str, CoreEngine] = {}
 
-        super().__init__(vllm_config, executor_class, log_stats)
+        super().__init__(vllm_config, executor_class, log_stats,
+                         client_addresses, client_index)
 
         assert len(self.core_engines) > 1
 
+        # List of [waiting, running] pair per engine.
+        self.lb_engines: list[list[int]] = []
+
+        self.first_req_sock_addr = get_open_zmq_inproc_path()
+        self.first_req_send_socket = self.resources.first_req_send_socket = (
+            make_zmq_socket(self.ctx,
+                            self.first_req_sock_addr,
+                            zmq.PAIR,
+                            bind=True))
+        try:
+            # If we are running in an asyncio event loop, start the stats task.
+            # Otherwise, it will be started lazily.
+            asyncio.get_running_loop()
+            self._ensure_stats_update_task()
+        except RuntimeError:
+            pass
+
+    def _ensure_stats_update_task(self):
+        resources = self.resources
+        if resources.stats_update_task is not None:
+            return
+
+        assert self.stats_update_address is not None
+
+        async def run_engine_stats_update_task():
+            with make_zmq_socket(self.ctx, self.stats_update_address,
+                                 zmq.XSUB) as socket, make_zmq_socket(
+                                     self.ctx,
+                                     self.first_req_sock_addr,
+                                     zmq.PAIR,
+                                     bind=False) as first_req_rcv_socket:
+                # Send subscription message.
+                await socket.send(b'\x01')
+
+                poller = zmq.asyncio.Poller()
+                poller.register(socket, zmq.POLLIN)
+                poller.register(first_req_rcv_socket, zmq.POLLIN)
+
+                while True:
+                    events = await poller.poll()
+                    if not self.engines_running and len(events) == 2 or (
+                            events[0][0] == first_req_rcv_socket):
+                        # Send a message to notify the coordinator that
+                        # we're sending a request while the engines are
+                        # paused, so that it can wake the others up
+                        # (to run dummy EP loop).
+                        self.engines_running = True
+                        buf = first_req_rcv_socket.recv(
+                            flags=zmq.NOBLOCK).result()
+                        target_eng_index = int.from_bytes(buf, "little")
+                        msg = msgspec.msgpack.encode(
+                            (target_eng_index, self.current_wave))
+                        await socket.send(msg)
+
+                    buf = None
+                    while True:
+                        # Drain all stats events (we only care about latest).
+                        future: asyncio.Future[bytes] = socket.recv(
+                            flags=zmq.NOBLOCK)
+                        if isinstance(future.exception(), zmq.Again):
+                            break
+                        buf = future.result()
+                    if buf is None:
+                        continue
+
+                    # Update local load-balancing state.
+                    counts, wave, running = msgspec.msgpack.decode(buf)
+                    self.current_wave = wave
+                    self.engines_running = running
+                    self.lb_engines = counts
+
+        resources.stats_update_task = asyncio.create_task(
+            run_engine_stats_update_task())
+
+    def get_core_engine_for_request(self,
+                                    dp_rank: Optional[int] = None
+                                    ) -> CoreEngine:
+        if dp_rank is not None:
+            # engines are already in rank order
+            return self.core_engines[dp_rank]
+
+        if not self.lb_engines:
+            return self.core_engines[0]
+        # TODO use P2C alg for larger DP sizes
+        num_engines = len(self.lb_engines)
+        min_counts = [sys.maxsize, sys.maxsize]
+        eng_index = 0
+        for i in range(num_engines):
+            # Start from client_index to help with balancing when engines
+            # are empty.
+            idx = (self.client_index + i) % num_engines
+            counts = self.lb_engines[idx]
+            if counts < min_counts:
+                min_counts = counts
+                eng_index = idx
+        # Adjust local counts for better balancing between stats updates
+        # from the coordinator (which happen every 100ms).
+        if min_counts[0]:
+            min_counts[0] += 1
+        else:
+            min_counts[1] += 1
+        return self.core_engines[eng_index]
+
     async def call_utility_async(self, method: str, *args) -> Any:
         # Only the result from the first engine is returned.
         return (await asyncio.gather(*[
@@ -939,62 +1036,31 @@ class DPAsyncMPClient(AsyncMPClient):
         ]))[0]
 
     async def add_request_async(self, request: EngineCoreRequest) -> None:
+        self._ensure_stats_update_task()
+
         request.current_wave = self.current_wave
+        request.client_index = self.client_index
 
-        chosen_engine = self.get_core_engine_for_request()
+        chosen_engine = self.get_core_engine_for_request(
+            request.data_parallel_rank)
         self.reqs_in_flight[request.request_id] = chosen_engine
-        chosen_engine.num_reqs_in_flight += 1
 
         to_await = self._send_input(EngineCoreRequestType.ADD, request,
                                     chosen_engine)
         if not self.engines_running:
-            # Send request to chosen engine and dp start loop
-            # control message to all other engines.
-            self.engines_running = True
-            to_await = asyncio.gather(
-                to_await,  # type: ignore[assignment]
-                *self._start_wave_coros(exclude_index=chosen_engine.index))
+            # Notify coordinator that we're sending a request
+            await self.first_req_send_socket.send(chosen_engine.identity)
 
         await to_await
 
         self._ensure_output_queue_task()
 
-    def get_core_engine_for_request(self) -> CoreEngine:
-        return min(self.core_engines, key=lambda e: e.num_reqs_in_flight)
-
     @staticmethod
     async def process_engine_outputs(self: "DPAsyncMPClient",
                                      outputs: EngineCoreOutputs):
-        if self.reqs_in_flight:
-            for req_id in outputs.finished_requests or ():
-                if engine := self.reqs_in_flight.pop(req_id, None):
-                    engine.num_reqs_in_flight -= 1
-
-        if outputs.wave_complete is not None:
-            # Current wave is complete, move to next wave number
-            # and mark engines as paused.
-            if self.current_wave <= outputs.wave_complete:
-                self.current_wave = outputs.wave_complete + 1
-                self.engines_running = False
-
-        elif outputs.start_wave is not None and (
-                outputs.start_wave > self.current_wave or
-            (outputs.start_wave == self.current_wave
-             and not self.engines_running)):
-            # Engine received request for a non-current wave so we must ensure
-            # that other engines progress to the next wave.
-            self.current_wave = outputs.start_wave
-            self.engines_running = True
-            await asyncio.gather(*self._start_wave_coros(
-                exclude_index=outputs.engine_index))
-
-    def _start_wave_coros(self, exclude_index: int) -> list[Awaitable[None]]:
-        logger.debug("Sending start DP wave %d.", self.current_wave)
-        return [
-            self._send_input(EngineCoreRequestType.START_DP_WAVE,
-                             self.current_wave, engine)
-            for engine in self.core_engines if engine.index != exclude_index
-        ]
+        if outputs.finished_requests and self.reqs_in_flight:
+            for req_id in outputs.finished_requests:
+                self.reqs_in_flight.pop(req_id, None)
 
     async def abort_requests_async(self, request_ids: list[str]) -> None:
         if not request_ids:
@@ -1019,3 +1085,50 @@ class DPAsyncMPClient(AsyncMPClient):
         if not self.resources.engine_dead:
             await self._send_input(EngineCoreRequestType.ABORT, request_ids,
                                    engine)
+
+
+class RayDPClient(DPAsyncMPClient):
+    """
+    Ray-based client for multi-proc, multi-engine (data parallel)
+    EngineCore.
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        executor_class: type[Executor],
+        log_stats: bool,
+        client_addresses: Optional[dict[str, str]] = None,
+        client_index: int = 0,
+    ):
+        super().__init__(vllm_config, executor_class, log_stats,
+                         client_addresses, client_index)
+
+    def _init_engines_direct(self, vllm_config: VllmConfig, local_only: bool,
+                             local_start_index: int, input_address: str,
+                             output_address: str,
+                             executor_class: type[Executor], log_stats: bool):
+        """Self-contained client mode, launch engine and coordinator process
+        as needed."""
+
+        parallel_config = vllm_config.parallel_config
+        assert parallel_config.data_parallel_rank == 0
+        assert local_start_index == 0
+
+        addresses = EngineZmqAddresses(
+            inputs=[input_address],
+            outputs=[output_address],
+        )
+
+        if len(self.core_engines) > 1:
+            coordinator = DPCoordinator(parallel_config)
+            self.resources.coordinator = coordinator
+            addresses.coordinator_input, addresses.coordinator_output = (
+                coordinator.get_engine_socket_addresses())
+
+        # Start all engines.
+        self.resources.engine_manager = CoreEngineActorManager(
+            vllm_config=vllm_config,
+            addresses=addresses,
+            executor_class=executor_class,
+            log_stats=log_stats)
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index dca327cc5d07bde3fc490249aabff55c362888e5..c6fe2d339c93d0306262c2365cb66074cfb57f52 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from typing import Optional
 
diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py
index 97dd31d5e5218f7353d59cc1d17f7dbb2f826114..692ba9dc840f8bc2ac3a5803ec3214de4e90f7d3 100644
--- a/vllm/v1/engine/exceptions.py
+++ b/vllm/v1/engine/exceptions.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 class EngineGenerateError(Exception):
     """Raised when a AsyncLLM.generate() fails. Recoverable."""
     pass
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index c856e2645a2c91b2d173f0ff604316db67f33cfe..736ffd8b40f00051e4b63e75cd51e6e7293af2c9 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Mapping
 from copy import copy
diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py
index 03d82b6bbc1d6c375db158cc0d66bef8db0cd50a..edc3be5b0120eaff46414c81fc901ebbe50178d1 100644
--- a/vllm/v1/engine/logprobs.py
+++ b/vllm/v1/engine/logprobs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
 from collections.abc import Iterable
diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py
index fcb90bebdb627e97416532feb88ffb595855613a..abe98a13dfd3ebdc8afeea3650bf71b05182301f 100644
--- a/vllm/v1/engine/mm_input_cache.py
+++ b/vllm/v1/engine/mm_input_cache.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Sequence
 from typing import Optional
 
@@ -34,8 +35,8 @@ class MirroredProcessingCache:
 
     def __init__(self, model_config):
         mm_config = model_config.multimodal_config
-        disable_mm_preprocessor_cache = mm_config is not None and \
-            not mm_config.disable_mm_preprocessor_cache
+        disable_mm_preprocessor_cache = (
+            mm_config is not None and mm_config.disable_mm_preprocessor_cache)
         self.use_cache = not disable_mm_preprocessor_cache
         self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB,
                                                       MultiModalKwargs)
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 293c291b43410899c92b78b52aebc45f69b8bf27..1dcfbab30cfb301a0d79fc8d0ee49e82816c7166 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
 from collections.abc import Iterable
diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py
index 4df7ca59731ec2bdaa6f0aebc201c2aa0ca6b7a3..1e9911152c6df347aed12787764621e87b4b2beb 100644
--- a/vllm/v1/engine/parallel_sampling.py
+++ b/vllm/v1/engine/parallel_sampling.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import copy
 from typing import Optional
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 64a756148780d1d0dac33bf33060bc3fc8312a9c..e28879d40460e6050b70bb243fb28516d87ccdba 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from collections.abc import Mapping, Sequence
@@ -211,6 +212,7 @@ class Processor:
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
     ) -> tuple[Optional[str], EngineCoreRequest]:
 
         # TODO(woosuk): Support pooling models.
@@ -224,6 +226,12 @@ class Processor:
         if prompt_adapter_request is not None:
             raise ValueError("V1 does not support prompt_adapter_request.")
 
+        data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
+        if data_parallel_rank is not None and not (0 <= data_parallel_rank <
+                                                   data_parallel_size):
+            raise ValueError(f"data_parallel_rank {data_parallel_rank} "
+                             f"is out of range [0, {data_parallel_size}).")
+
         if arrival_time is None:
             arrival_time = time.time()
 
@@ -327,6 +335,7 @@ class Processor:
             arrival_time=arrival_time,
             lora_request=lora_request,
             cache_salt=decoder_inputs.get("cache_salt"),
+            data_parallel_rank=data_parallel_rank,
         )
 
     def _validate_model_inputs(self,
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 3b9feb0d32980c2946595c48b8d3ecd4ee520852..50b9634a49e1b638ac278bab8b0a3501105b0468 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from concurrent.futures import Future
 from typing import Callable, Union
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index eb5f9d4bfe004a295b6125413208bd391a8bc319..2148680d5f565ebd963e3a8c26d46cebcc6c0e04 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import multiprocessing
 import os
 import pickle
@@ -19,6 +20,7 @@ from typing import Any, Callable, Optional, Union, cast
 
 import cloudpickle
 
+import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel)
@@ -71,7 +73,10 @@ class MultiprocExecutor(Executor):
 
         # Initialize worker and set up message queues for SchedulerOutputs
         # and ModelRunnerOutputs
-        self.rpc_broadcast_mq = MessageQueue(self.world_size, self.world_size)
+        max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
+        self.rpc_broadcast_mq = MessageQueue(self.world_size,
+                                             self.world_size,
+                                             max_chunk_bytes=max_chunk_bytes)
         scheduler_output_handle = self.rpc_broadcast_mq.export_handle()
 
         # Create workers
diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py
index 320ebfd37ae375fb75a70e76169ef808ae23abc5..257564793cf4e52fcf74a35194d9a42bc1230280 100644
--- a/vllm/v1/executor/ray_distributed_executor.py
+++ b/vllm/v1/executor/ray_distributed_executor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from concurrent.futures import Future
 from typing import Union
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 2747fc7fabd1e45f000f2dfd61817c61c7cfa215..e938f3bfc6714fd093f23d5e05969d711a3761a6 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 from dataclasses import dataclass
@@ -156,11 +157,10 @@ class SlidingWindowSpec(AttentionSpec):
 @dataclass
 class KVCacheTensor:
     """
-    A dataclass for specifying how the workers should initialize the KV cache
-    for a layer. Only contains the size of KV cache for that layer for now. Will
-    be extended to support multiple layers sharing the same memory pool.
+    A class for specifying how the workers should initialize the KV cache.
     """
-    size: int  # The size of KV cache Tensor in bytes
+    size: int  # size of the KV cache tensor in bytes
+    shared_by: list[str]  # layer names that share the same KV cache tensor
 
 
 @dataclass
@@ -182,27 +182,13 @@ class KVCacheConfig:
     """
     """The number of KV cache blocks"""
     num_blocks: int
-    """layer_name -> how to initialize KV cache for that layer"""
-    tensors: dict[str, KVCacheTensor]
+    """How should model runner initialize the KV cache tensors for each layer"""
+    kv_cache_tensors: list[KVCacheTensor]
     """
     The kv cache groups of the model.
-    The layers in the models are repeated with some patterns, e.g., a model
-    with 10 full attention layers and 20 sliding window attention layers can be
-    regarded as repeating the pattern (1 * full, 2 * sw) 10 times. 
-    The KVCacheManager allocates different block tables for each of the 3 layers
-    in the pattern, and repeats each of them 10 times to generate the 
-    block_table for the 30 layers in the model.
-    Therefore, we can group the layers in the model into 3 groups, each of which
-    contains 10 layers in the model.
-    The KVCacheManager allocates the block_table for each group based on its
-    kv_cache spec, and the model runner applies the block table to each layer 
-    in the group.
-    For example:
-    1. A model only uses full attention. The pattern is 
-    (num_hidden_layers * full), so there is only one group and the block table 
-    is shared by all layers.
-    2. (WIP) A model with 10 full attention layers and 20 sliding window 
-    attention layers. There are 3 layers in the pattern (1 * full, 2 * sw), so 
-    there are 3 groups, each of which represents 10 layers in the model.
+    For models with only one type of attention, there is only one group that
+    contains all layers.
+    For models with multiple types of attention, there will be multiple groups,
+    see `_get_kv_cache_config_uniform_page_size` for more details.
     """
     kv_cache_groups: list[KVCacheGroupSpec]
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 3dc2f77444f63b319e7a1b8b05d4264351f00136..2d621ec31038f2b91c00a19b913216fea7df5cdb 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
 import time
@@ -12,13 +13,12 @@ from vllm.config import SupportsMetricsInfo, VllmConfig
 from vllm.logger import init_logger
 from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
 from vllm.v1.engine import FinishReason
+from vllm.v1.metrics.prometheus import unregister_vllm_metrics
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm
 
 logger = init_logger(__name__)
 
-_LOCAL_LOGGING_INTERVAL_SEC = 5.0
-
 StatLoggerFactory = Callable[[VllmConfig, int], "StatLoggerBase"]
 
 
@@ -35,7 +35,7 @@ class StatLoggerBase(ABC):
         ...
 
     @abstractmethod
-    def record(self, scheduler_stats: SchedulerStats,
+    def record(self, scheduler_stats: Optional[SchedulerStats],
                iteration_stats: Optional[IterationStats]):
         ...
 
@@ -78,20 +78,22 @@ class LoggingStatLogger(StatLoggerBase):
         # Compute summary metrics for tracked stats
         return float(np.sum(tracked_stats) / (now - self.last_log_time))
 
-    def record(self, scheduler_stats: SchedulerStats,
+    def record(self, scheduler_stats: Optional[SchedulerStats],
                iteration_stats: Optional[IterationStats]):
         """Log Stats to standard output."""
 
         if iteration_stats:
             self._track_iteration_stats(iteration_stats)
 
-        self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats)
+        if scheduler_stats is not None:
+            self.prefix_caching_metrics.observe(
+                scheduler_stats.prefix_cache_stats)
 
-        if scheduler_stats.spec_decoding_stats is not None:
-            self.spec_decoding_logging.observe(
-                scheduler_stats.spec_decoding_stats)
+            if scheduler_stats.spec_decoding_stats is not None:
+                self.spec_decoding_logging.observe(
+                    scheduler_stats.spec_decoding_stats)
 
-        self.last_scheduler_stats = scheduler_stats
+            self.last_scheduler_stats = scheduler_stats
 
     def log(self):
         now = time.monotonic()
@@ -131,10 +133,11 @@ class LoggingStatLogger(StatLoggerBase):
         self.spec_decoding_logging.log(log_fn=log_fn)
 
     def log_engine_initialized(self):
-        logger.info(
-            "vllm cache_config_info with initialization " \
-            "after num_gpu_blocks is: %d",
-            self.vllm_config.cache_config.num_gpu_blocks)
+        if self.vllm_config.cache_config.num_gpu_blocks:
+            logger.info(
+                "Engine %03d: vllm cache_config_info with initialization "
+                "after num_gpu_blocks is: %d", self.engine_index,
+                self.vllm_config.cache_config.num_gpu_blocks)
 
 
 class PrometheusStatLogger(StatLoggerBase):
@@ -144,7 +147,8 @@ class PrometheusStatLogger(StatLoggerBase):
     _spec_decoding_cls = SpecDecodingProm
 
     def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
-        self._unregister_vllm_metrics()
+
+        unregister_vllm_metrics()
         self.vllm_config = vllm_config
         self.engine_index = engine_index
         # Use this flag to hide metrics that were deprecated in
@@ -169,11 +173,13 @@ class PrometheusStatLogger(StatLoggerBase):
         self.gauge_scheduler_running = self._gauge_cls(
             name="vllm:num_requests_running",
             documentation="Number of requests in model execution batches.",
+            multiprocess_mode="mostrecent",
             labelnames=labelnames).labels(*labelvalues)
 
         self.gauge_scheduler_waiting = self._gauge_cls(
             name="vllm:num_requests_waiting",
             documentation="Number of requests waiting to be processed.",
+            multiprocess_mode="mostrecent",
             labelnames=labelnames).labels(*labelvalues)
 
         #
@@ -182,6 +188,7 @@ class PrometheusStatLogger(StatLoggerBase):
         self.gauge_gpu_cache_usage = self._gauge_cls(
             name="vllm:gpu_cache_usage_perc",
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
+            multiprocess_mode="mostrecent",
             labelnames=labelnames).labels(*labelvalues)
 
         self.counter_gpu_prefix_cache_queries = self._counter_cls(
@@ -242,6 +249,9 @@ class PrometheusStatLogger(StatLoggerBase):
                 buckets=build_1_2_5_buckets(max_model_len),
                 labelnames=labelnames).labels(*labelvalues)
 
+        # TODO: This metric might be incorrect in case of using multiple
+        # api_server counts which uses prometheus mp.
+        # See: https://github.com/vllm-project/vllm/pull/18053
         self.histogram_iteration_tokens = \
             self._histogram_cls(
                 name="vllm:iteration_tokens_total",
@@ -340,6 +350,9 @@ class PrometheusStatLogger(StatLoggerBase):
         #
         # LoRA metrics
         #
+
+        # TODO: This metric might be incorrect in case of using multiple
+        # api_server counts which uses prometheus mp.
         self.gauge_lora_info: Optional[prometheus_client.Gauge] = None
         if vllm_config.lora_config is not None:
             self.labelname_max_lora = "max_lora"
@@ -350,13 +363,16 @@ class PrometheusStatLogger(StatLoggerBase):
                 self._gauge_cls(
                     name="vllm:lora_requests_info",
                     documentation="Running stats on lora requests.",
+                    multiprocess_mode="sum",
                     labelnames=[
                         self.labelname_max_lora,
                         self.labelname_waiting_lora_adapters,
                         self.labelname_running_lora_adapters,
-                    ])
+                    ],
+                )
 
     def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
+
         metrics_info = config_obj.metrics_info()
         metrics_info["engine"] = self.engine_index
 
@@ -372,25 +388,28 @@ class PrometheusStatLogger(StatLoggerBase):
         info_gauge = self._gauge_cls(
             name=name,
             documentation=documentation,
-            labelnames=metrics_info.keys()).labels(**metrics_info)
+            multiprocess_mode="mostrecent",
+            labelnames=metrics_info.keys(),
+        ).labels(**metrics_info)
         info_gauge.set(1)
 
-    def record(self, scheduler_stats: SchedulerStats,
+    def record(self, scheduler_stats: Optional[SchedulerStats],
                iteration_stats: Optional[IterationStats]):
         """Log to prometheus."""
-        self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
-        self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
+        if scheduler_stats is not None:
+            self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
+            self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
 
-        self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
+            self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
 
-        self.counter_gpu_prefix_cache_queries.inc(
-            scheduler_stats.prefix_cache_stats.queries)
-        self.counter_gpu_prefix_cache_hits.inc(
-            scheduler_stats.prefix_cache_stats.hits)
+            self.counter_gpu_prefix_cache_queries.inc(
+                scheduler_stats.prefix_cache_stats.queries)
+            self.counter_gpu_prefix_cache_hits.inc(
+                scheduler_stats.prefix_cache_stats.hits)
 
-        if scheduler_stats.spec_decoding_stats is not None:
-            self.spec_decoding_prom.observe(
-                scheduler_stats.spec_decoding_stats)
+            if scheduler_stats.spec_decoding_stats is not None:
+                self.spec_decoding_prom.observe(
+                    scheduler_stats.spec_decoding_stats)
 
         if iteration_stats is None:
             return
@@ -445,13 +464,6 @@ class PrometheusStatLogger(StatLoggerBase):
             self.gauge_lora_info.labels(**lora_info_labels)\
                                 .set_to_current_time()
 
-    @staticmethod
-    def _unregister_vllm_metrics():
-        # Unregister any existing vLLM collectors (for CI/CD
-        for collector in list(prometheus_client.REGISTRY._collector_to_names):
-            if hasattr(collector, "_name") and "vllm" in collector._name:
-                prometheus_client.REGISTRY.unregister(collector)
-
     def log_engine_initialized(self):
         self.log_metrics_info("cache_config", self.vllm_config.cache_config)
 
diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py
new file mode 100644
index 0000000000000000000000000000000000000000..61ba5d66cb31a473e43d7cfd6a6de12e23fc4491
--- /dev/null
+++ b/vllm/v1/metrics/prometheus.py
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import tempfile
+from typing import Optional
+
+from prometheus_client import REGISTRY, CollectorRegistry, multiprocess
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+# Global temporary directory for prometheus multiprocessing
+_prometheus_multiproc_dir: Optional[tempfile.TemporaryDirectory] = None
+
+
+def setup_multiprocess_prometheus():
+    """Set up prometheus multiprocessing directory if not already configured.
+    
+    """
+    global _prometheus_multiproc_dir
+
+    if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
+        # Make TemporaryDirectory for prometheus multiprocessing
+        # Note: global TemporaryDirectory will be automatically
+        # cleaned up upon exit.
+        _prometheus_multiproc_dir = tempfile.TemporaryDirectory()
+        os.environ["PROMETHEUS_MULTIPROC_DIR"] = _prometheus_multiproc_dir.name
+        logger.debug("Created PROMETHEUS_MULTIPROC_DIR at %s",
+                     _prometheus_multiproc_dir.name)
+    else:
+        logger.warning("Found PROMETHEUS_MULTIPROC_DIR was set by user. "
+                       "This directory must be wiped between vLLM runs or "
+                       "you will find inaccurate metrics. Unset the variable "
+                       "and vLLM will properly handle cleanup.")
+
+
+def get_prometheus_registry():
+    """Get the appropriate prometheus registry based on multiprocessing 
+    configuration.
+    
+    Returns:
+        Registry: A prometheus registry
+    """
+    if os.getenv("PROMETHEUS_MULTIPROC_DIR") is not None:
+        logger.debug("Using multiprocess registry for prometheus metrics")
+        registry = CollectorRegistry()
+        multiprocess.MultiProcessCollector(registry)
+        return registry
+
+    return REGISTRY
+
+
+def unregister_vllm_metrics():
+    """Unregister any existing vLLM collectors from the prometheus registry.
+    
+    This is useful for testing and CI/CD where metrics may be registered
+    multiple times across test runs.
+    
+    Also, in case of multiprocess, we need to unregister the metrics from the 
+    global registry.
+    """
+    registry = REGISTRY
+    # Unregister any existing vLLM collectors
+    for collector in list(registry._collector_to_names):
+        if hasattr(collector, "_name") and "vllm" in collector._name:
+            registry.unregister(collector)
+
+
+def shutdown_prometheus():
+    """Shutdown prometheus metrics."""
+
+    path = _prometheus_multiproc_dir
+    if path is None:
+        return
+    try:
+        pid = os.getpid()
+        multiprocess.mark_process_dead(pid, path)
+        logger.debug("Marked Prometheus metrics for process %d as dead", pid)
+    except Exception as e:
+        logger.error("Error during metrics cleanup: %s", str(e))
diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py
index a51c3ed7f57204900dc4a09bc0039c387ae14034..cce692d6c09e7f4bb4b36029419e06aefd97e253 100644
--- a/vllm/v1/metrics/ray_wrappers.py
+++ b/vllm/v1/metrics/ray_wrappers.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import time
 from typing import Optional, Union
 
@@ -30,6 +31,16 @@ class RayPrometheusMetric:
 
             self.metric.set_default_tags(labelskwargs)
 
+        if labels:
+            if len(labels) != len(self.metric._tag_keys):
+                raise ValueError(
+                    "Number of labels must match the number of tag keys. "
+                    f"Expected {len(self.metric._tag_keys)}, got {len(labels)}"
+                )
+
+            self.metric.set_default_tags(
+                dict(zip(self.metric._tag_keys, labels)))
+
         return self
 
 
diff --git a/vllm/v1/metrics/reader.py b/vllm/v1/metrics/reader.py
index 5ab78129a0094396300ce7dc23ec75ef7b1729cc..4d6e59984154115a1ad363cab0991ee957eaafed 100644
--- a/vllm/v1/metrics/reader.py
+++ b/vllm/v1/metrics/reader.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Optional
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 8fe1630616a47c78797540cddd08dcdef0cc3890..50c8b07fe54d262dad3c5f80ff5e8eb6a050ce5e 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
 from dataclasses import dataclass, field
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index e8ce0df5ed8d28d51c3d2c2e0f8d28baa9dbed12..17a299d57cbaaae7a9ce9a3c8b5bf2a309869fe8 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import NamedTuple, Optional
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index b4c84507532a1ada8d8cda2b6a7b21dfeb944851..53fd70fabecf327e4e4861800d67164db0cc484b 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 from typing import TYPE_CHECKING, Any, Optional, Union
@@ -26,12 +27,13 @@ class Request:
         multi_modal_placeholders: Optional[list[PlaceholderRange]],
         sampling_params: SamplingParams,
         eos_token_id: Optional[int],
-        arrival_time: float,
+        client_index: int = 0,
         lora_request: Optional["LoRARequest"] = None,
         structured_output_request: Optional["StructuredOutputRequest"] = None,
         cache_salt: Optional[str] = None,
     ) -> None:
         self.request_id = request_id
+        self.client_index = client_index
         self.sampling_params = sampling_params
         # Because of LoRA, the eos token id can be different for each request.
         self.eos_token_id = eos_token_id
@@ -90,13 +92,13 @@ class Request:
 
         return cls(
             request_id=request.request_id,
+            client_index=request.client_index,
             prompt_token_ids=request.prompt_token_ids,
             multi_modal_inputs=request.mm_inputs,
             multi_modal_hashes=request.mm_hashes,
             multi_modal_placeholders=request.mm_placeholders,
             sampling_params=request.sampling_params,
             eos_token_id=request.eos_token_id,
-            arrival_time=request.arrival_time,
             lora_request=request.lora_request,
             structured_output_request=StructuredOutputRequest(
                 sampling_params=request.sampling_params),
diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py
index e97e1235fb36522ca46ba8605fb9137bfd43aa30..ab13b288a5a9b26c03b8e2c56df2243c7b7e7f93 100644
--- a/vllm/v1/sample/metadata.py
+++ b/vllm/v1/sample/metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import Optional
diff --git a/vllm/v1/sample/ops/bad_words.py b/vllm/v1/sample/ops/bad_words.py
index 2984d4e4806fe76ad58ec0a5db67486f0f05545e..1b699565f26f2c654aa223425756c6c58d05ae8c 100644
--- a/vllm/v1/sample/ops/bad_words.py
+++ b/vllm/v1/sample/ops/bad_words.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py
index ed05e3f48401a59d37f35a23e6b99541d003bce4..48423b9b424ddc9cdbffae333c0ee47e76c464c6 100644
--- a/vllm/v1/sample/ops/penalties.py
+++ b/vllm/v1/sample/ops/penalties.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 4a5fbb10d408bddcf95ec347ee427b3710844810..30396f1594337995b60457f92220c4ca164b420e 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py
index 17b870fede8e7f4aa36b90dfc22168e3563bbe37..b2354c53302adb8e8df5b8cff281657b58992fbc 100644
--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index 16561d30a6dc309e2cc32d13cd18299be491e213..6bc0cecdd4940a8a859cc0800f4c99cfcb70c122 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -1,9 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A layer that samples the next tokens from the model's outputs."""
 
 import torch
 import torch.nn as nn
 
+from vllm.utils import async_tensor_h2d, is_pin_memory_available
 from vllm.v1.outputs import LogprobsTensors, SamplerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.ops.bad_words import apply_bad_words
@@ -19,6 +21,7 @@ class Sampler(nn.Module):
     def __init__(self):
         super().__init__()
         self.topk_topp_sampler = TopKTopPSampler()
+        self.pin_memory = is_pin_memory_available()
 
     def forward(
         self,
@@ -231,6 +234,10 @@ class Sampler(nn.Module):
         # One idea is implement this as a PyTorch C++ op, and we may
         # even optimize the logit_bias layout.
 
+        rows: list[int] = []
+        cols: list[int] = []
+        vals: list[float] = []
+
         # Get vocabulary size from logits
         vocab_size = logits.shape[-1]
 
@@ -243,7 +250,16 @@ class Sampler(nn.Module):
                             f"token_id {token_id} in logit_bias contains "
                             f"out-of-vocab token id. Vocabulary size: "
                             f"{vocab_size}")
-                    logits[i, token_id] += bias
+                    rows.append(i)
+                    cols.append(token_id)
+                    vals.append(bias)
+
+        if rows:
+            indices = async_tensor_h2d([rows, cols], torch.int64,
+                                       logits.device, self.pin_memory)
+            values = async_tensor_h2d(vals, torch.float, logits.device,
+                                      self.pin_memory)
+            logits.index_put_(tuple(indices), values=values, accumulate=True)
         return logits
 
     def apply_allowed_token_ids(
diff --git a/vllm/v1/sample/tpu/metadata.py b/vllm/v1/sample/tpu/metadata.py
index a1c7dcdb111f5000b2a0e5f904f80cd6af263a95..4c1ac4895197c7bf821a8bed16d684df63fb8b1c 100644
--- a/vllm/v1/sample/tpu/metadata.py
+++ b/vllm/v1/sample/tpu/metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass, field
 from typing import Optional
 
diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py
index 7c31a2984b30719dff94b6b18115ceb07f08013f..1056eb1d7b7fee76bbdb1b23fca53e1feea99b97 100644
--- a/vllm/v1/sample/tpu/sampler.py
+++ b/vllm/v1/sample/tpu/sampler.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Sampler layer implementing TPU supported operations."""
 
 import torch
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index 0dcf02113f5a820476bd3a534caa3b3defb5c65b..ab6653a786ffeacee2d982ef9da0efa467a97a55 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import pickle
@@ -158,10 +159,8 @@ class MsgpackEncoder:
         self, obj: torch.Tensor
     ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
         assert self.aux_buffers is not None
-        # this creates a copy of the tensor if it's not already contiguous
-        obj = obj.contiguous()
-        #  view the tensor as a 1D array of bytes
-        arr = obj.view((obj.numel(), )).view(torch.uint8).numpy()
+        # view the tensor as a contiguous 1D array of bytes
+        arr = obj.flatten().contiguous().view(torch.uint8).numpy()
         if obj.nbytes < self.size_threshold:
             # Smaller tensors are encoded inline, just like ndarrays.
             data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data)
@@ -169,7 +168,7 @@ class MsgpackEncoder:
             # Otherwise encode index of backing buffer to avoid copy.
             data = len(self.aux_buffers)
             self.aux_buffers.append(arr.data)
-        dtype = str(obj.dtype)[6:]  # remove 'torch.' prefix
+        dtype = str(obj.dtype).removeprefix("torch.")
         return dtype, obj.shape, data
 
     def _encode_nested_tensors(self, nt: NestedTensors) -> Any:
@@ -245,7 +244,7 @@ class MsgpackDecoder:
         # zero-copy decode. We assume the ndarray will not be kept around,
         # as it now locks the whole received message buffer in memory.
         buffer = self.aux_buffers[data] if isinstance(data, int) else data
-        return np.ndarray(buffer=buffer, dtype=np.dtype(dtype), shape=shape)
+        return np.frombuffer(buffer, dtype=dtype).reshape(shape)
 
     def _decode_tensor(self, arr: Any) -> torch.Tensor:
         dtype, shape, data = arr
@@ -254,12 +253,15 @@ class MsgpackDecoder:
         # not complain about a readonly memoryview.
         buffer = self.aux_buffers[data] if isinstance(data, int) \
             else bytearray(data)
-        # Create numpy wrapper around the bytes
-        arr = np.ndarray(buffer=buffer, dtype=np.uint8, shape=(len(buffer), ))
         torch_dtype = getattr(torch, dtype)
         assert isinstance(torch_dtype, torch.dtype)
+        if not buffer:  # torch.frombuffer doesn't like empty buffers
+            assert 0 in shape
+            return torch.empty(shape, dtype=torch_dtype)
+        # Create uint8 array
+        arr = torch.frombuffer(buffer, dtype=torch.uint8)
         # Convert back to proper shape & type
-        return torch.from_numpy(arr).view(torch_dtype).view(shape)
+        return arr.view(torch_dtype).view(shape)
 
     def _decode_mm_items(self, obj: list) -> list[MultiModalKwargsItem]:
         decoded_items = []
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 971b06758c21415cf0ba88a844e2b098e98d59a9..4b5c9b7ec640e8e32f0b5a1b5944845df5763cad 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 import torch.nn as nn
 
@@ -9,6 +10,7 @@ from vllm.distributed.parallel_state import get_pp_group
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.models import supports_multimodal
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
 from vllm.v1.attention.backends.flash_attn import (CommonAttentionMetadata,
                                                    FlashAttentionMetadata)
@@ -328,16 +330,19 @@ class EagleProposer:
         self.attn_layer_names = list(draft_attn_layer_names)
 
         # share embed_tokens with the target model if needed
-        if get_pp_group().world_size == 1:
+        if get_pp_group().world_size == 1 \
+            and self.model.model.embed_tokens.weight.shape \
+                == target_model.model.embed_tokens.weight.shape:
             logger.info(
-                "The EAGLE head shares the same vocab embedding" \
+                "Assuming the EAGLE head shares the same vocab embedding" \
                 " with the target model."
             )
+            del self.model.model.embed_tokens
             self.model.model.embed_tokens = target_model.model.embed_tokens
         else:
             logger.info(
-                "Since PP > 1, the EAGLE head loaded its own vocab embedding" \
-                " weights instead of sharing them with the target model."
+                "The EAGLE head's vocab embedding will be loaded separately" \
+                " from the target model."
             )
 
         # share lm_head with the target model if needed
@@ -346,7 +351,10 @@ class EagleProposer:
         if self.vllm_config.speculative_config.method != "eagle3" and \
                 hasattr(target_model, "lm_head"):
             logger.info("Loading EAGLE LM head weights from the target model.")
-            self.model.lm_head = target_model.lm_head
+            if supports_multimodal(target_model):
+                self.model.lm_head = target_model.get_language_model().lm_head
+            else:
+                self.model.lm_head = target_model.lm_head
 
     @torch.inference_mode()
     def dummy_run(
diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py
index fdac2ef64c3f73b5e4e900025c104eda7426f154..f516bf486b8b52a918ad37a2ba334f4f940aeec1 100644
--- a/vllm/v1/spec_decode/medusa.py
+++ b/vllm/v1/spec_decode/medusa.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import torch
 import torch.nn as nn
diff --git a/vllm/v1/spec_decode/metadata.py b/vllm/v1/spec_decode/metadata.py
index 1cf650d5fa5699e850a930eb4d426bc45db84b2f..b1efb40612d541f04ceaa682a3e2ad6fbaf8d042 100644
--- a/vllm/v1/spec_decode/metadata.py
+++ b/vllm/v1/spec_decode/metadata.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
 import numpy as np
diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py
index 36091bef2895909f03388849bab819e8faa7e023..b4bc3058c570ac3cbf5d27ba6ad78c8d3758c211 100644
--- a/vllm/v1/spec_decode/metrics.py
+++ b/vllm/v1/spec_decode/metrics.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass, field
 from typing import Optional
diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py
index 704153d43a2b48033d29eeefe9b82c86f472f9a7..6b90d0970bd77054c91fa90a009d661d72bcbc2f 100644
--- a/vllm/v1/spec_decode/ngram_proposer.py
+++ b/vllm/v1/spec_decode/ngram_proposer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import numpy as np
diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py
index 334258e7f87ae01aea59f402202bb8b6f2519364..5c37333cebc7ac19d047238cee62698e00f82d59 100644
--- a/vllm/v1/spec_decode/utils.py
+++ b/vllm/v1/spec_decode/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm.triton_utils import tl, triton
 from vllm.v1.worker.gpu_input_batch import InputBatch
 
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index c701ab1d35a58f70560f5762b184fe387268c008..b2b0ee796954325279c23d8108e9fefefff9de71 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 import multiprocessing
@@ -149,31 +150,37 @@ class StructuredOutputManager:
         # NOTE: This outer loop can likely be parallelized to improve
         # performance of bitmask generation for large batches.
         for req_id, _ in ordered_seq:
-            request = requests[req_id].structured_output_request
-            if TYPE_CHECKING:
-                assert request is not None
-                assert request.grammar is not None
+            request = requests[req_id]
+            structured_output_request = request.structured_output_request
 
-            apply_bitmask = (
-                request.reasoning_ended if self.reasoner is not None else True
-            )  # noqa: E501
+            if TYPE_CHECKING:
+                assert structured_output_request is not None
+                assert structured_output_request.grammar is not None
+            apply_bitmask: bool = True
+            if self.reasoner is not None:
+                if structured_output_request.reasoning_ended is None:
+                    structured_output_request.reasoning_ended = \
+                        self.reasoner.is_reasoning_end(request.prompt_token_ids)
+                apply_bitmask = structured_output_request.reasoning_ended
 
             state_advancements = 0
             req_tokens = scheduled_spec_decode_tokens.get(req_id, []) + [None]
             for i, token in enumerate(req_tokens):
-                if apply_bitmask and not request.grammar.is_terminated():
-                    request.grammar.fill_bitmask(bitmask_tensor,
-                                                 cumulative_index)
+                if apply_bitmask and not \
+                    structured_output_request.grammar.is_terminated():
+                    structured_output_request.grammar.fill_bitmask(
+                        bitmask_tensor, cumulative_index)
                     if token is not None:
                         # In order to generate the correct bitmask for each
                         # position in the speculative sequence, we advance
                         # the FSM state for each speculative token and rollback
                         # to restore the previous state when we are finished.
-                        assert request.grammar.accept_tokens(req_id, [token])
+                        assert structured_output_request.grammar.accept_tokens(
+                            req_id, [token])
                         state_advancements += 1
                 cumulative_index += 1
             if state_advancements > 0:
-                request.grammar.rollback(state_advancements)
+                structured_output_request.grammar.rollback(state_advancements)
 
         if cumulative_index < bitmask_tensor.shape[0]:
             bitmask_tensor = bitmask_tensor[:cumulative_index]
diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py
index 55c5f609095d7b25d96bd9a765d4d43477b5eee3..02e7fc33f517de6b25429878e809b273ac67a2ea 100644
--- a/vllm/v1/structured_output/backend_guidance.py
+++ b/vllm/v1/structured_output/backend_guidance.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py
index 09f6cdf7333720019260e30ae01efee3bc8c2f6f..d500783aa4b30c6fd7c969178cc6c246d3646796 100644
--- a/vllm/v1/structured_output/backend_types.py
+++ b/vllm/v1/structured_output/backend_types.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py
index f2570221da2528de78cc4fe819a929d7303053cf..88544565e5443eaf4e2f5e6575f90617c090c85b 100644
--- a/vllm/v1/structured_output/backend_xgrammar.py
+++ b/vllm/v1/structured_output/backend_xgrammar.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py
index c16320b9e74c6bc9c14ce0c8fc4bcb0e610e3ea8..fc365f12573fc0377ed04cdfb82a5ebd223b1ea2 100644
--- a/vllm/v1/structured_output/request.py
+++ b/vllm/v1/structured_output/request.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
 import dataclasses
@@ -20,7 +21,7 @@ class StructuredOutputRequest:
     sampling_params: SamplingParams
     _grammar: Optional[Union[Future[StructuredOutputGrammar],
                              StructuredOutputGrammar]] = None
-    reasoning_ended: bool = False
+    reasoning_ended: Optional[bool] = None
 
     def _check_grammar_completion(self) -> bool:
         # NOTE: We have to lazy import to gate circular imports
diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index 111e92dc0990d144579058ca050ed3c42758a73e..7adee7237bd1258971ab4c863a70c6cd15fa187f 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from __future__ import annotations
 
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 0758747a83cc6831e3c9beb65b161eb0e4f24453..192c9067740c23b7c621f3adae2ec75ed36319cb 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -1,31 +1,44 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import os
+import argparse
+import multiprocessing
 import time
 import weakref
 from collections import defaultdict
 from collections.abc import Sequence
+from dataclasses import dataclass
+from enum import Enum, auto
 from multiprocessing import Process, connection
-from typing import (TYPE_CHECKING, Callable, Generic, Optional, TypeVar, Union,
-                    overload)
+from multiprocessing.process import BaseProcess
+from typing import (TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar,
+                    Union, overload)
 
+import msgspec
 import torch
+import zmq
 
-from vllm.config import VllmConfig
+from vllm.config import CacheConfig, ParallelConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.models.utils import extract_layer_index
 from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                   usage_message)
-from vllm.utils import get_mp_context, kill_process_tree
+from vllm.utils import (get_mp_context, get_open_port, get_open_zmq_ipc_path,
+                        get_tcp_uri, kill_process_tree)
 from vllm.v1.executor.abstract import Executor
 
 if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup
+
     from vllm.attention.layer import Attention
+    from vllm.v1.engine.coordinator import DPCoordinator
 
 logger = init_logger(__name__)
 
 T = TypeVar("T")
 
+STARTUP_POLL_PERIOD_MS = 10000
+
 
 class ConstantList(Generic[T], Sequence):
 
@@ -95,6 +108,117 @@ class ConstantList(Generic[T], Sequence):
         return f"ConstantList({self._x})"
 
 
+def get_engine_client_zmq_addr(local_only: bool,
+                               host: str,
+                               port: int = 0) -> str:
+    return get_open_zmq_ipc_path() if local_only else (get_tcp_uri(
+        host, port or get_open_port()))
+
+
+class CoreEngineState(Enum):
+    NEW = auto()
+    CONNECTED = auto()
+    READY = auto()
+
+
+class CoreEngine:
+    """One per data parallel rank."""
+
+    def __init__(self, index: int = 0, local: bool = True):
+        self.local = local
+        self.index = index
+        self.identity = index.to_bytes(2, "little")
+
+        self.state = CoreEngineState.NEW
+
+
+@dataclass
+class EngineZmqAddresses:
+    # ZMQ input socket addresses for each front-end client (requests)
+    inputs: list[str]
+    # ZMQ output socket addresses for each front-end client (responses)
+    outputs: list[str]
+    # ZMQ input socket address of DP coordinator if applicable
+    coordinator_input: Optional[str] = None
+    # ZMQ output socket address of DP coordinator if applicable
+    coordinator_output: Optional[str] = None
+
+
+@dataclass
+class EngineHandshakeMetadata:
+    """Metadata sent to each engine process during startup handshake,
+    including addresses of the front-end ZMQ queues that they should
+    connect to.
+    """
+    addresses: EngineZmqAddresses
+    parallel_config: dict[str, Union[int, str]]
+
+
+class APIServerProcessManager:
+    """Manages a group of API server processes.
+    
+    Handles creation, monitoring, and termination of API server worker
+    processes. Also monitors extra processes to check if they are healthy.
+    """
+
+    def __init__(
+        self,
+        target_server_fn: Callable,
+        listen_address: str,
+        sock: Any,
+        args: argparse.Namespace,
+        num_servers: int,
+        input_addresses: list[str],
+        output_addresses: list[str],
+        stats_update_address: Optional[str] = None,
+    ):
+        """Initialize and start API server worker processes.
+        
+        Args:
+            target_server_fn: Function to call for each API server process
+            listen_address: Address to listen for client connections
+            sock: Socket for client connections
+            args: Command line arguments
+            num_servers: Number of API server processes to start
+            input_addresses: Input addresses for each API server
+            output_addresses: Output addresses for each API server
+            stats_update_address: Optional stats update address 
+        """
+        self.listen_address = listen_address
+        self.sock = sock
+        self.args = args
+
+        # Start API servers
+        spawn_context = multiprocessing.get_context("spawn")
+        self.processes: list[BaseProcess] = []
+
+        for i, in_addr, out_addr in zip(range(num_servers), input_addresses,
+                                        output_addresses):
+            client_config = {
+                "input_address": in_addr,
+                "output_address": out_addr,
+                "client_index": i
+            }
+            if stats_update_address is not None:
+                client_config["stats_update_address"] = stats_update_address
+
+            proc = spawn_context.Process(target=target_server_fn,
+                                         name=f"ApiServer_{i}",
+                                         args=(listen_address, sock, args,
+                                               client_config))
+            self.processes.append(proc)
+            proc.start()
+
+        logger.info("Started %d API server processes", len(self.processes))
+
+        # Shutdown only the API server processes on garbage collection
+        # The extra processes are managed by their owners
+        self._finalizer = weakref.finalize(self, shutdown, self.processes)
+
+    def close(self) -> None:
+        self._finalizer()
+
+
 class CoreEngineProcManager:
     """
     Utility class to handle creation, readiness, and shutdown
@@ -109,7 +233,7 @@ class CoreEngineProcManager:
         local_start_index: int,
         vllm_config: VllmConfig,
         on_head_node: bool,
-        input_address: str,
+        handshake_address: str,
         executor_class: type[Executor],
         log_stats: bool,
     ):
@@ -117,12 +241,12 @@ class CoreEngineProcManager:
         common_kwargs = {
             "vllm_config": vllm_config,
             "on_head_node": on_head_node,
-            "input_address": input_address,
+            "handshake_address": handshake_address,
             "executor_class": executor_class,
             "log_stats": log_stats,
         }
 
-        self.processes: list[Process] = []
+        self.processes: list[BaseProcess] = []
         for index in range(local_engine_count):
             local_index = local_start_index + index
             global_index = start_index + index
@@ -135,8 +259,7 @@ class CoreEngineProcManager:
                                     "local_dp_rank": local_index,
                                 }))
 
-        self._finalizer = weakref.finalize(self, shutdown, self.processes,
-                                           input_address)
+        self._finalizer = weakref.finalize(self, shutdown, self.processes)
         try:
             for proc in self.processes:
                 proc.start()
@@ -164,9 +287,341 @@ class CoreEngineProcManager:
         }
 
 
+class CoreEngineActorManager:
+    """
+    Utility class to handle creation, readiness, and shutdown
+    of core engine Ray actors used by the AsyncLLM and LLMEngine.
+
+    Different from CoreEngineProcManager, this class manages
+    core engines for both local and remote nodes.
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        addresses: EngineZmqAddresses,
+        executor_class: type[Executor],
+        log_stats: bool,
+        placement_groups: Optional[list["PlacementGroup"]] = None,
+        local_dp_ranks: Optional[list[int]] = None,
+    ):
+        import copy
+
+        import ray
+        from ray.util.scheduling_strategies import (
+            PlacementGroupSchedulingStrategy)
+
+        from vllm.v1.engine.core import DPEngineCoreActor
+
+        self.local_engine_actors: list[ray.ActorHandle] = []
+        self.remote_engine_actors: list[ray.ActorHandle] = []
+        dp_size = vllm_config.parallel_config.data_parallel_size
+        local_engine_count = \
+            vllm_config.parallel_config.data_parallel_size_local
+        world_size = vllm_config.parallel_config.world_size
+
+        if ray.is_initialized():
+            logger.info(
+                "Ray is already initialized. Skipping Ray initialization.")
+        else:
+            ray.init()
+
+        if placement_groups is not None:
+            assert local_dp_ranks is not None, (
+                "local_dp_ranks must be provided if "
+                "placement_groups is provided")
+            assert len(placement_groups) == len(local_dp_ranks), (
+                "placement_groups and local_dp_ranks must "
+                "have the same length")
+            logger.info("Using provided placement groups")
+            # TODO(rui): validate passed-in placement groups
+            self.created_placement_groups = []
+        else:
+            placement_groups, local_dp_ranks = \
+                CoreEngineActorManager.create_dp_placement_groups(vllm_config)
+            self.created_placement_groups = placement_groups
+        assert len(placement_groups) == dp_size, (
+            "Number of placement groups must match data parallel size")
+
+        refs = []
+        for index in range(dp_size):
+            local_index = local_dp_ranks[index]
+            dp_vllm_config = copy.deepcopy(vllm_config)
+            pg = placement_groups[index]
+            dp_vllm_config.parallel_config.placement_group = pg
+            on_head_node = index < local_engine_count
+            actor = ray.remote(DPEngineCoreActor).options(
+                scheduling_strategy=PlacementGroupSchedulingStrategy(
+                    placement_group=pg,
+                    placement_group_bundle_index=world_size,
+                )).remote(vllm_config=dp_vllm_config,
+                          executor_class=executor_class,
+                          log_stats=log_stats,
+                          on_head_node=on_head_node,
+                          addresses=addresses,
+                          dp_rank=index,
+                          local_dp_rank=local_index)
+            if on_head_node:
+                self.local_engine_actors.append(actor)
+            else:
+                self.remote_engine_actors.append(actor)
+            refs.append(actor.wait_for_init.remote())
+
+        ray.get(refs)
+        self.run_refs = []
+        for actor in self.local_engine_actors + self.remote_engine_actors:
+            self.run_refs.append(actor.run.remote())
+
+    @staticmethod
+    def create_dp_placement_groups(
+            vllm_config: VllmConfig
+    ) -> tuple[list["PlacementGroup"], list[int]]:
+
+        import ray
+        from ray._private.state import available_resources_per_node
+        from ray.util.state import list_nodes
+
+        logger.info("Creating placement groups for data parallel")
+        dp_master_ip = \
+            vllm_config.parallel_config.data_parallel_master_ip
+        dp_size = vllm_config.parallel_config.data_parallel_size
+        local_engine_count = \
+            vllm_config.parallel_config.data_parallel_size_local
+
+        nodes = list_nodes()
+        nodes = sorted(list_nodes(),
+                       key=lambda node: node.node_ip != dp_master_ip)
+        assert nodes[0].node_ip == dp_master_ip, (
+            "The first node must be the head node")
+        assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, (
+            "There can only be one head node")
+
+        available_resources = available_resources_per_node()
+        world_size = vllm_config.parallel_config.world_size
+        placement_groups: list[PlacementGroup] = []
+        local_dp_ranks: list[int] = []
+
+        for node in nodes:
+            node_ip = node.node_ip
+            node_resources = available_resources[node.node_id]
+            # For now, each DP rank can only be assigned to one node
+            # TODO(rui): support allocating a single DP rank
+            # to multiple nodes
+            available_engine_count = int(node_resources["GPU"]) // world_size
+            if node_ip == dp_master_ip:
+                assert available_engine_count >= local_engine_count, (
+                    "Not enough resources to allocate DP ranks "
+                    f"on DP master node {node_ip}")
+                for i in range(local_engine_count):
+                    bundles = [{
+                        "GPU": 1.0,
+                        "node:" + dp_master_ip: 0.001
+                    }] * world_size + [{
+                        "CPU": 1.0
+                    }]
+                    pg = ray.util.placement_group(
+                        name=f"dp_rank_{len(placement_groups)}",
+                        strategy="STRICT_PACK",
+                        bundles=bundles,
+                    )
+                    placement_groups.append(pg)
+                    local_dp_ranks.append(i)
+            else:
+                for i in range(available_engine_count):
+                    if len(placement_groups) == dp_size:
+                        break
+                    bundles = [{"GPU": 1.0}] * world_size + [{"CPU": 1.0}]
+                    pg = ray.util.placement_group(
+                        name=f"dp_rank_{len(placement_groups)}",
+                        strategy="STRICT_PACK",
+                        bundles=bundles,
+                    )
+                    placement_groups.append(pg)
+                    local_dp_ranks.append(i)
+        return placement_groups, local_dp_ranks
+
+    def get_run_refs(self):
+        return self.run_refs
+
+    def close(self):
+        import ray
+        for actor in self.local_engine_actors + self.remote_engine_actors:
+            ray.kill(actor)
+        for pg in self.created_placement_groups:
+            ray.util.remove_placement_group(pg)
+
+
+def wait_for_engine_startup(
+    handshake_socket: zmq.Socket,
+    addresses: EngineZmqAddresses,
+    core_engines: list[CoreEngine],
+    parallel_config: ParallelConfig,
+    cache_config: CacheConfig,
+    proc_manager: Optional[CoreEngineProcManager],
+    coord_process: Optional[Process],
+):
+
+    # Wait for engine core process(es) to send ready messages.
+    local_count = parallel_config.data_parallel_size_local
+    remote_count = len(core_engines) - local_count
+    # [local, remote] counts
+    conn_pending, start_pending = [local_count, remote_count], [0, 0]
+    poller = zmq.Poller()
+    poller.register(handshake_socket, zmq.POLLIN)
+
+    if proc_manager is not None:
+        for sentinel in proc_manager.sentinels():
+            poller.register(sentinel, zmq.POLLIN)
+    if coord_process is not None:
+        poller.register(coord_process.sentinel, zmq.POLLIN)
+    while any(conn_pending) or any(start_pending):
+        events = poller.poll(STARTUP_POLL_PERIOD_MS)
+        if not events:
+            if any(conn_pending):
+                logger.debug(
+                    "Waiting for %d local, %d remote core engine proc(s) "
+                    "to connect.", *conn_pending)
+            if any(start_pending):
+                logger.debug(
+                    "Waiting for %d local, %d remote core engine proc(s) "
+                    "to start.", *start_pending)
+            continue
+        if len(events) > 1 or events[0][0] != handshake_socket:
+            # One of the local core processes exited.
+            finished = proc_manager.finished_procs() if proc_manager else {}
+            if coord_process is not None and coord_process.exitcode is not None:
+                finished[coord_process.name] = coord_process.exitcode
+            raise RuntimeError("Engine core initialization failed. "
+                               "See root cause above. "
+                               f"Failed core proc(s): {finished}")
+
+        # Receive HELLO and READY messages from the input socket.
+        eng_identity, ready_msg_bytes = handshake_socket.recv_multipart()
+        eng_index = int.from_bytes(eng_identity, "little")
+        engine = next((e for e in core_engines if e.identity == eng_identity),
+                      None)
+        if engine is None:
+            raise RuntimeError(f"Message from engine with unexpected data "
+                               f"parallel rank: {eng_index}")
+        msg = msgspec.msgpack.decode(ready_msg_bytes)
+        status, local = msg["status"], msg["local"]
+        if local != engine.local:
+            raise RuntimeError(f"{status} message from "
+                               f"{'local' if local else 'remote'} "
+                               f"engine {eng_index}, expected it to be "
+                               f"{'local' if engine.local else 'remote'}")
+
+        if status == "HELLO" and engine.state == CoreEngineState.NEW:
+
+            # Send init message with DP config info.
+            init_message = msgspec.msgpack.encode(
+                EngineHandshakeMetadata(
+                    addresses=addresses,
+                    parallel_config={
+                        "data_parallel_master_ip":
+                        parallel_config.data_parallel_master_ip,
+                        "data_parallel_master_port":
+                        parallel_config.data_parallel_master_port,
+                        "data_parallel_size":
+                        parallel_config.data_parallel_size,
+                    }))
+            handshake_socket.send_multipart((eng_identity, init_message),
+                                            copy=False)
+            conn_pending[0 if local else 1] -= 1
+            start_pending[0 if local else 1] += 1
+            engine.state = CoreEngineState.CONNECTED
+        elif status == "READY" and (engine.state == CoreEngineState.CONNECTED):
+            # Setup KV cache config with initialization state from
+            # engine core process. Sum values from all engines in DP case.
+            num_gpu_blocks = cache_config.num_gpu_blocks or 0
+            num_gpu_blocks += msg["num_gpu_blocks"]
+            cache_config.num_gpu_blocks = num_gpu_blocks
+
+            start_pending[0 if local else 1] -= 1
+            engine.state = CoreEngineState.READY
+        else:
+            raise RuntimeError(f"Unexpected {status} message for "
+                               f"{'local' if local else 'remote'} engine "
+                               f"{eng_index} in {engine.state} state.")
+
+        logger.debug("%s from %s core engine process %s.", status,
+                     "local" if local else "remote", eng_index)
+
+
+def wait_for_completion_or_failure(
+        api_server_manager: APIServerProcessManager,
+        engine_manager: Optional[Union[CoreEngineProcManager,
+                                       CoreEngineActorManager]] = None,
+        coordinator: Optional["DPCoordinator"] = None) -> None:
+    """Wait for all processes to complete or detect if any fail.
+    
+    Raises an exception if any process exits with a non-zero status.
+
+    Args:
+        api_server_manager: The manager for API servers.
+        engine_manager: The manager for engine processes.
+            If CoreEngineProcManager, it manages local engines;
+            if CoreEngineActorManager, it manages all engines.
+        coordinator: The coordinator for data parallel.
+    """
+
+    try:
+        logger.info("Waiting for API servers to complete ...")
+        # Create a mapping of sentinels to their corresponding processes
+        # for efficient lookup
+        sentinel_to_proc: dict[Any, BaseProcess] = {
+            proc.sentinel: proc
+            for proc in api_server_manager.processes
+        }
+
+        if coordinator:
+            sentinel_to_proc[coordinator.proc.sentinel] = coordinator.proc
+
+        actor_run_refs = []
+        if isinstance(engine_manager, CoreEngineProcManager):
+            for proc in engine_manager.processes:
+                sentinel_to_proc[proc.sentinel] = proc
+        elif isinstance(engine_manager, CoreEngineActorManager):
+            actor_run_refs = engine_manager.get_run_refs()
+
+        # Check if any process terminates
+        while sentinel_to_proc or actor_run_refs:
+            # Wait for any process to terminate
+            ready_sentinels: list[Any] = connection.wait(sentinel_to_proc,
+                                                         timeout=5)
+
+            # Process any terminated processes
+            for sentinel in ready_sentinels:
+                proc = sentinel_to_proc.pop(sentinel)
+
+                # Check if process exited with error
+                if proc.exitcode != 0:
+                    raise RuntimeError(
+                        f"Process {proc.name} (PID: {proc.pid}) "
+                        f"died with exit code {proc.exitcode}")
+
+            if actor_run_refs:
+                import ray
+                _, actor_run_refs = ray.wait(actor_run_refs, timeout=5)
+
+    except KeyboardInterrupt:
+        logger.info("Received KeyboardInterrupt, shutting down API servers...")
+    except Exception as e:
+        logger.exception("Exception occurred while running API servers: %s",
+                         str(e))
+        raise
+    finally:
+        logger.info("Terminating remaining processes ...")
+        api_server_manager.close()
+        if coordinator:
+            coordinator.close()
+        if engine_manager:
+            engine_manager.close()
+
+
 # Note(rob): shutdown function cannot be a bound method,
-# else the gc cannot collect the objedecoupct.
-def shutdown(procs: list[Process], input_address: str):
+# else the gc cannot collect the object.
+def shutdown(procs: list[BaseProcess]):
     # Shutdown the process.
     for proc in procs:
         if proc.is_alive():
@@ -185,12 +640,6 @@ def shutdown(procs: list[Process], input_address: str):
         if proc.is_alive() and (pid := proc.pid) is not None:
             kill_process_tree(pid)
 
-    # Remove zmq ipc socket files.
-    if input_address.startswith("ipc://"):
-        socket_file = input_address[len("ipc://"):]
-        if os and os.path.exists(socket_file):
-            os.remove(socket_file)
-
 
 def bind_kv_cache(
     kv_caches: dict[str, torch.Tensor],
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index 576086ebeb7f757085cc832cb0aa2e7f4d8d4fa1..8f4e8d64c615d46aed7bae91ed5cce61e9567243 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import numpy as np
 import torch
@@ -104,17 +105,19 @@ class MultiGroupBlockTable:
 
     def __init__(self, max_num_reqs: int, max_model_len: int,
                  max_num_batched_tokens: int, pin_memory: bool,
-                 device: torch.device, block_size: int) -> None:
+                 device: torch.device, block_sizes: list[int]) -> None:
         self.block_tables = [
             BlockTable(max_num_reqs, cdiv(max_model_len, block_size),
                        max_num_batched_tokens, pin_memory, device)
+            for block_size in block_sizes
         ]
 
-    def append_row(self, block_ids: list[list[int]], row_idx: int) -> None:
+    def append_row(self, block_ids: tuple[list[int], ...],
+                   row_idx: int) -> None:
         for i, block_table in enumerate(self.block_tables):
             block_table.append_row(block_ids[i], row_idx)
 
-    def add_row(self, block_ids: list[list[int]], row_idx: int) -> None:
+    def add_row(self, block_ids: tuple[list[int], ...], row_idx: int) -> None:
         for i, block_table in enumerate(self.block_tables):
             block_table.add_row(block_ids[i], row_idx)
 
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..607cfc0ef69cd738d6b928e646dbc8e7bd766d1a
--- /dev/null
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: Apache-2.0
+from contextlib import contextmanager
+from typing import Any
+
+import torch
+
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader import get_model
+from vllm.v1.worker.gpu_model_runner import GPUModelRunner
+
+logger = init_logger(__name__)
+
+
+class CPUModelRunner(GPUModelRunner):
+
+    def __init__(self, vllm_config: VllmConfig, device: torch.device):
+        super().__init__(vllm_config, device)
+
+        assert device == torch.device("cpu")
+        assert self.speculative_config is None, "spec decode is not supported."
+
+        self.use_cuda_graph = False
+        self.cascade_attn_enabled = False
+
+        self._postprocess_tenosrs()
+
+    def _postprocess_tenosrs(self) -> None:
+        # Note: replace device tensors with cpu tensors
+        def replace_tensor(obj: Any, cpu_attr_name: str,
+                           device_attr_name) -> None:
+            cpu_tensor = getattr(obj, cpu_attr_name, None)
+            device_tensor = getattr(obj, device_attr_name, None)
+            if cpu_tensor is not None and device_tensor is not None:
+                assert isinstance(cpu_tensor, torch.Tensor)
+                assert isinstance(device_tensor, torch.Tensor)
+                setattr(obj, device_attr_name, cpu_tensor)
+
+        for k, v in vars(self).items():
+            if k.endswith("_cpu") and isinstance(v, torch.Tensor):
+                replace_tensor(self, k, k[:-4])
+
+        for k, v in vars(self.input_batch).items():
+            if k.endswith("_cpu_tensor") and isinstance(v, torch.Tensor):
+                replace_tensor(self.input_batch, k, k[:-11])
+
+        for k, v in vars(self.input_batch.block_table).items():
+            if k.endswith("_cpu") and isinstance(v, torch.Tensor):
+                replace_tensor(self.input_batch.block_table, k, k[:-4])
+
+    def load_model(self) -> None:
+        logger.info("Starting to load model %s...", self.model_config.model)
+        self.model = get_model(vllm_config=self.vllm_config)
+
+        if self.lora_config:
+            self.model = self.load_lora_model(self.model, self.model_config,
+                                              self.scheduler_config,
+                                              self.lora_config, self.device)
+
+    def warming_up_model(self) -> None:
+        logger.info("Warming up model for the compilation...")
+        # Only generate graph for the generic shape
+        self._dummy_run(max(16, self.max_num_reqs))
+        logger.info("Warming up done.")
+
+    def _init_device_properties(self) -> None:
+        pass
+
+    def _sync_device(self) -> None:
+        pass
+
+
+@contextmanager
+def _set_global_compilation_settings():
+    import torch._inductor.config
+
+    # Note: The CPPGEMM backend requires freezing parameters.
+    freezing_value = torch._inductor.config.freezing
+    torch._inductor.config.freezing = True
+    # Note: workaround for "ValueError: fast mode: can't pickle cyclic objects
+    # including object type dict"
+    force_disable_caches = torch._inductor.config.force_disable_caches
+    torch._inductor.config.force_disable_caches = True
+    yield
+    torch._inductor.config.freezing = freezing_value
+    torch._inductor.config.force_disable_caches = force_disable_caches
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a35e8812038673146ce21dc3a08d5355396357f
--- /dev/null
+++ b/vllm/v1/worker/cpu_worker.py
@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+from importlib import util
+from typing import Optional
+
+import torch
+
+from vllm import envs
+from vllm.config import VllmConfig
+from vllm.distributed.parallel_state import get_pp_group, get_tp_group
+from vllm.logger import init_logger
+from vllm.model_executor.utils import set_random_seed
+from vllm.sequence import IntermediateTensors
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.worker.cpu_model_runner import CPUModelRunner
+from vllm.v1.worker.gpu_worker import (Worker,
+                                       init_worker_distributed_environment)
+
+logger = init_logger(__name__)
+
+
+class CPUWorker(Worker):
+
+    def __init__(self,
+                 vllm_config: VllmConfig,
+                 local_rank: int,
+                 rank: int,
+                 distributed_init_method: str,
+                 is_driver_worker: bool = False):
+        super().__init__(vllm_config,
+                         local_rank,
+                         rank,
+                         distributed_init_method,
+                         is_driver_worker=is_driver_worker)
+
+        self.parallel_config.disable_custom_all_reduce = True
+
+    def init_device(self):
+        # Setup OpenMP threads affinity.
+        omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
+        self.local_omp_cpuid = "all"
+        if omp_cpuids == "auto":
+            self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes(
+            )
+        else:
+            self.local_omp_cpuid = omp_cpuids.split("|")[self.rank]
+
+        if self.local_omp_cpuid != "all":
+            ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
+            if ret:
+                logger.info(ret)
+
+        # Note: unique identifier for creating allreduce shared memory
+        os.environ["VLLM_DIST_IDENT"] = self.distributed_init_method.split(
+            ":")[-1]
+        # Initialize the distributed environment.
+        init_worker_distributed_environment(self.vllm_config, self.rank,
+                                            self.distributed_init_method,
+                                            self.local_rank, "gloo")
+        # Set random seed.
+        set_random_seed(self.model_config.seed)
+
+        # Construct the model runner
+        self.model_runner: CPUModelRunner = CPUModelRunner(
+            self.vllm_config, torch.device("cpu"))
+
+    def sleep(self, level: int = 1) -> None:
+        logger.warning("sleep mode is not supported on CPU, ignore it.")
+        pass
+
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        logger.warning("sleep mode is not supported on CPU, ignore it.")
+        pass
+
+    def determine_available_memory(self) -> int:
+        return self.cache_config.cpu_kvcache_space_bytes  # type: ignore
+
+    def compile_or_warm_up_model(self) -> None:
+        # Reset the seed to ensure that the random state is not affected by
+        # the model initialization and profiling.
+        set_random_seed(self.model_config.seed)
+        self.model_runner.warming_up_model()
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        scheduler_output: "SchedulerOutput",
+    ) -> Optional[ModelRunnerOutput]:
+        intermediate_tensors = None
+        if not get_pp_group().is_first_rank:
+            intermediate_tensors = IntermediateTensors(
+                get_pp_group().recv_tensor_dict(
+                    all_gather_group=get_tp_group()))
+
+        output = self.model_runner.execute_model(scheduler_output,
+                                                 intermediate_tensors)
+
+        if not get_pp_group().is_last_rank:
+            assert isinstance(output, IntermediateTensors)
+            get_pp_group().send_tensor_dict(output.tensors,
+                                            all_gather_group=get_tp_group())
+            return None
+
+        assert isinstance(output, ModelRunnerOutput)
+        return output if self.is_driver_worker else None
+
+    def get_cpus_id_binding_based_on_numa_nodes(self) -> str:
+        """Return CPUs id binding based on NUMA nodes.
+        """
+        rank_to_cpus = self.local_omp_cpuid
+        # Setup OpenMP thread affinity based on NUMA nodes automatically
+        world_size = self.vllm_config.parallel_config.world_size
+        libnuma_found = util.find_spec("numa") is not None
+        psutil_found = util.find_spec("psutil") is not None
+        if libnuma_found and psutil_found:
+            import psutil
+            from numa import info
+            cpu_count = psutil.cpu_count(logical=False)
+            cpus_allow_list = psutil.Process().cpu_affinity()
+            numa_size = info.get_num_configured_nodes()
+            cpu_count_per_numa = cpu_count // numa_size
+            num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU,
+                                      cpu_count_per_numa // 2)
+
+            # check allow node_to_cpus list
+            node_to_cpus = []
+            for i in range(numa_size):
+                node_intersect = set(
+                    info.node_to_cpus(i)).intersection(cpus_allow_list)
+                if bool(node_intersect):
+                    node_to_cpus.append(list(node_intersect))
+
+            if world_size > len(node_to_cpus):
+                logger.error(
+                    "Auto thread-binding failed due to "
+                    "world size: %d is larger than "
+                    "allowed NUMA nodes number: %d."
+                    "Please try to bind threads manually.", world_size,
+                    len(node_to_cpus))
+            else:
+                end = cpu_count_per_numa - num_of_reserved_cpu
+                rank_to_cpus_list = node_to_cpus[self.rank][:end]
+                rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list)
+                logger.info("auto thread-binding list: %s", rank_to_cpus)
+        else:
+            logger.warning(
+                "Auto thread-binding is not supported due to "
+                "the lack of package numa and psutil,"
+                "fallback to no thread-binding. To get better performance,"
+                "please try to manually bind threads.")
+        return rank_to_cpus
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index b3e65917d3cc21d2bd62b65905c8f2ff73407d51..ebb770a7ddb29ca76aa2833992936b4790a9c2d8 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Datastructures defining an input batch
 
 from dataclasses import dataclass
@@ -29,7 +30,7 @@ class CachedRequestState:
     sampling_params: SamplingParams
     generator: Optional[torch.Generator]
 
-    block_ids: list[list[int]]
+    block_ids: tuple[list[int], ...]
     num_computed_tokens: int
     output_token_ids: list[int]
 
@@ -55,14 +56,14 @@ class CachedRequestState:
 class InputBatch:
 
     def __init__(
-        self,
-        max_num_reqs: int,
-        max_model_len: int,
-        max_num_batched_tokens: int,
-        device: torch.device,
-        pin_memory: bool,
-        vocab_size: int,
-        block_size: int,
+            self,
+            max_num_reqs: int,
+            max_model_len: int,
+            max_num_batched_tokens: int,
+            device: torch.device,
+            pin_memory: bool,
+            vocab_size: int,
+            block_sizes: list[int],  # The block_size of each kv cache group
     ):
         self.max_num_reqs = max_num_reqs
         self.max_model_len = max_model_len
@@ -104,7 +105,7 @@ class InputBatch:
             max_num_batched_tokens=max_num_batched_tokens,
             pin_memory=pin_memory,
             device=device,
-            block_size=block_size,
+            block_sizes=block_sizes,
         )
 
         # Sampling-related.
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 910c0e80bb31cba762ff7958220f4c89e47e03e9..b1bc727e1e8ea59b35ea97c730473e7480fd3fbe 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1,16 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
 import gc
 import time
 import weakref
-from typing import TYPE_CHECKING, Optional, Union
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 import numpy as np
 import torch
 import torch.distributed
 import torch.nn as nn
 
+import vllm.envs as envs
 from vllm.attention import AttentionType, get_attn_backend
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadataBuilder)
@@ -24,10 +27,11 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
 from vllm.distributed.parallel_state import (
     get_pp_group, get_tp_group, graph_capture,
     prepare_communication_buffer_for_model)
-from vllm.forward_context import get_forward_context, set_forward_context
+from vllm.forward_context import (DPMetadata, get_forward_context,
+                                  set_forward_context)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
-from vllm.model_executor.model_loader import TensorizerLoader, get_model
+from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.multimodal.utils import group_mm_inputs_by_modality
@@ -36,7 +40,6 @@ from vllm.sequence import IntermediateTensors
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         GiB_bytes, LazyLoader, async_tensor_h2d, cdiv,
                         check_use_alibi, is_pin_memory_available)
-from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec,
@@ -57,8 +60,8 @@ from vllm.v1.worker.block_table import BlockTable
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 
-from .utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs,
-                    scatter_mm_placeholders)
+from .utils import (gather_mm_placeholders, initialize_kv_cache_for_kv_sharing,
+                    sanity_check_mm_encoder_outputs, scatter_mm_placeholders)
 
 if TYPE_CHECKING:
     import xgrammar as xgr
@@ -141,40 +144,44 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         self.attn_metadata_builders: list[AttentionMetadataBuilder] = []
         self.attn_backends: list[type[AttentionBackend]] = []
         # self.kv_cache_config: KVCacheConfig
-        # self.input_batch: InputBatch # Persistent batch.
 
         # req_id -> (input_id -> encoder_output)
         self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {}
 
-        # Set up speculative decoding.
-        self.use_spec_decode = False
         self.use_aux_hidden_state_outputs = False
-        if self.speculative_config:
-            self.use_spec_decode = True
-
-            # NOTE(Jiayi): currently we put the entire draft model on
-            # the last PP rank. This is not ideal if there are many
-            # layers in the draft model.
-            if get_pp_group().is_last_rank:
-                if self.speculative_config.method == "ngram":
-                    self.drafter = NgramProposer(self.vllm_config)
-                elif self.speculative_config.use_eagle():
-                    self.drafter = EagleProposer(self.vllm_config, self.device,
-                                                 self)  # type: ignore
-                    if self.speculative_config.method == "eagle3":
-                        self.use_aux_hidden_state_outputs = True
-                elif self.speculative_config.method == "medusa":
-                    self.drafter = MedusaProposer(
-                        vllm_config=self.vllm_config,
-                        device=self.device)  # type: ignore
-                else:
-                    raise ValueError("Unknown speculative decoding method: "
-                                     f"{self.speculative_config.method}")
-                self.rejection_sampler = RejectionSampler()
+        # Set up speculative decoding.
+        # NOTE(Jiayi): currently we put the entire draft model on
+        # the last PP rank. This is not ideal if there are many
+        # layers in the draft model.
+        if self.speculative_config and get_pp_group().is_last_rank:
+            if self.speculative_config.method == "ngram":
+                self.drafter = NgramProposer(self.vllm_config)
+            elif self.speculative_config.use_eagle():
+                self.drafter = EagleProposer(self.vllm_config, self.device,
+                                             self)  # type: ignore
+                if self.speculative_config.method == "eagle3":
+                    self.use_aux_hidden_state_outputs = True
+            elif self.speculative_config.method == "medusa":
+                self.drafter = MedusaProposer(
+                    vllm_config=self.vllm_config,
+                    device=self.device)  # type: ignore
+            else:
+                raise ValueError("Unknown speculative decoding method: "
+                                 f"{self.speculative_config.method}")
+            self.rejection_sampler = RejectionSampler()
 
         # Request states.
         self.requests: dict[str, CachedRequestState] = {}
 
+        # Input Batch
+        # NOTE(Chen): Ideally, we should initialize the input batch inside
+        # `initialize_kv_cache` based on the kv cache config. However, as in
+        # https://github.com/vllm-project/vllm/pull/18298, due to some unknown
+        # reasons, we have to initialize the input batch before `load_model`,
+        # quantization + weight offloading will fail otherwise. As a temporary
+        # solution, we initialize the input batch here, and re-initialize it
+        # in `initialize_kv_cache` if the block_sizes here is different from
+        # the block_sizes in the kv cache config.
         self.input_batch = InputBatch(
             max_num_reqs=self.max_num_reqs,
             max_model_len=self.max_model_len,
@@ -182,7 +189,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             device=self.device,
             pin_memory=self.pin_memory,
             vocab_size=self.model_config.get_vocab_size(),
-            block_size=self.cache_config.block_size,
+            block_sizes=[self.cache_config.block_size],
         )
 
         self.use_cuda_graph = (self.vllm_config.compilation_config.level
@@ -197,8 +204,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 self.vllm_config.compilation_config.cudagraph_capture_sizes))
 
         # Cache the device properties.
-        self.device_properties = torch.cuda.get_device_properties(self.device)
-        self.num_sms = self.device_properties.multi_processor_count
+        self._init_device_properties()
 
         # Persistent buffers for CUDA graphs.
         self.input_ids = torch.zeros(self.max_num_tokens,
@@ -278,6 +284,12 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                                         pin_memory=self.pin_memory)
         self.seq_lens_np = self.seq_lens_cpu.numpy()
 
+        # Layer pairings for cross-layer KV sharing.
+        # If an Attention layer `layer_name` is in the keys of this dict, it
+        # means this layer will perform attention using the keys and values
+        # from the KV cache of `shared_kv_cache_layers[layer_name]`.
+        self.shared_kv_cache_layers: dict[str, str] = {}
+
     def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool:
         """
         Update the order of requests in the batch based on the attention
@@ -303,6 +315,17 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 self.input_batch, scheduler_output)
         return batch_reordered
 
+    # Note: used for model runner override.
+    def _init_device_properties(self) -> None:
+        """Initialize attributes from torch.cuda.get_device_properties
+        """
+        self.device_properties = torch.cuda.get_device_properties(self.device)
+        self.num_sms = self.device_properties.multi_processor_count
+
+    # Note: used for model runner override.
+    def _sync_device(self) -> None:
+        torch.cuda.synchronize()
+
     def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         """Update the cached states and the persistent batch with the scheduler
         output.
@@ -439,8 +462,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             # Update the block IDs.
             if not req_data.resumed_from_preemption:
                 # Append the new blocks to the existing block IDs.
-                for i in range(len(self.kv_cache_config.kv_cache_groups)):
-                    req_state.block_ids[i].extend(req_data.new_block_ids[i])
+                for block_ids, new_block_ids in zip(  # type: ignore[call-overload]
+                        req_state.block_ids,
+                        req_data.new_block_ids,
+                        strict=True):
+                    block_ids.extend(new_block_ids)
             else:
                 # The request is resumed from preemption.
                 # Replace the existing block IDs with the new ones.
@@ -503,11 +529,30 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         if batch_changed or batch_reordered:
             self.input_batch.refresh_sampling_metadata()
 
+    def _get_cumsum_and_arange(
+        self,
+        num_tokens: np.ndarray,
+        cumsum_dtype: Optional[np.dtype] = None,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """Get the cumulative sum and batched arange of the given array.
+        # E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2])
+        # Equivalent to but faster than:
+        # np.concatenate([np.arange(n) for n in num_tokens])
+        """
+        # Step 1. [2, 5, 3] -> [2, 7, 10]
+        cu_num_tokens = np.cumsum(num_tokens, dtype=cumsum_dtype)
+        total_num_tokens = cu_num_tokens[-1]
+        # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
+        cumsums_offsets = np.repeat(cu_num_tokens - num_tokens, num_tokens)
+        # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
+        arange = self.arange_np[:total_num_tokens] - cumsums_offsets
+
+        return cu_num_tokens, arange
+
     def _prepare_inputs(
         self,
         scheduler_output: "SchedulerOutput",
-    ) -> tuple[dict[str, FlashAttentionMetadata], torch.Tensor,
-               Optional[SpecDecodeMetadata]]:
+    ) -> tuple[dict[str, Any], torch.Tensor, Optional[SpecDecodeMetadata]]:
         total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         assert total_num_scheduled_tokens > 0
         num_reqs = self.input_batch.num_reqs
@@ -528,17 +573,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         req_indices = np.repeat(self.arange_np[:num_reqs],
                                 num_scheduled_tokens)
 
-        # Get batched arange.
-        # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
-        # Equivalent to but faster than:
-        # np.concatenate([np.arange(n) for n in num_scheduled_tokens])
-        # Step 1. [2, 5, 3] -> [2, 7, 10]
-        cu_num_tokens = np.cumsum(num_scheduled_tokens)
-        # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
-        cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens,
-                                    num_scheduled_tokens)
-        # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
-        arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets
+        # cu_num_tokens: [2, 5, 3] -> [2, 7, 10]
+        # arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
+        cu_num_tokens, arange = self._get_cumsum_and_arange(
+            num_scheduled_tokens)
 
         # Get positions.
         positions_np = self.positions_np[:total_num_scheduled_tokens]
@@ -619,7 +657,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 
         # Fill unused with -1. Needed for reshape_and_cache
         self.seq_lens[num_reqs:].fill_(0)
-        self.query_start_loc[num_reqs + 1:].fill_(-1)
+        # Note: pad query_start_loc to be non-decreasing, as kernels
+        # like FlashAttention requires that
+        self.query_start_loc[num_reqs + 1:].fill_(
+            self.query_start_loc_cpu[num_reqs].item())
 
         query_start_loc = self.query_start_loc[:num_reqs + 1]
         seq_lens = self.seq_lens[:num_reqs]
@@ -627,7 +668,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         common_attn_metadata = CommonAttentionMetadata(
             query_start_loc=query_start_loc, seq_lens=seq_lens)
 
-        attn_metadata: dict[str, FlashAttentionMetadata] = {}
+        attn_metadata: dict[str, Any] = {}
         # Prepare the attention metadata for each KV cache group and make layers
         # in the same group share the same metadata.
         for kv_cache_group_id, kv_cache_group_spec in enumerate(
@@ -844,32 +885,25 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         # Compute the logits indices.
         # [4, 1, 3, 1, 2]
         num_sampled_tokens = num_draft_tokens + 1
-        # Step 1. [4, 5, 8, 9, 11]
-        cu_num_sampled_tokens = np.cumsum(num_sampled_tokens, dtype=np.int32)
-        total_num_sampled_tokens = cu_num_sampled_tokens[-1]
-        # Step 2. [0, 0, 0, 0, 4, 5, 5, 5, 8, 9, 9]
-        cumsums_offsets = np.repeat(cu_num_sampled_tokens - num_sampled_tokens,
-                                    num_sampled_tokens)
-        # Step 3. [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1]
-        arange = self.arange_np[:total_num_sampled_tokens] - cumsums_offsets
-        # Step 4. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207]
+
+        # Step 1. cu_num_sampled_tokens: [4, 5, 8, 9, 11]
+        # arange: [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1]
+        cu_num_sampled_tokens, arange = self._get_cumsum_and_arange(
+            num_sampled_tokens, cumsum_dtype=np.int32)
+        # Step 2. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207]
         logits_indices = np.repeat(
             cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens)
-        # Step 5. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208]
+        # Step 3. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208]
         logits_indices += arange
 
         # Compute the bonus logits indices.
         bonus_logits_indices = cu_num_sampled_tokens - 1
 
         # Compute the draft logits indices.
-        # [3, 3, 5, 5, 6]
-        cu_num_draft_tokens = np.cumsum(num_draft_tokens, dtype=np.int32)
-        total_num_draft_tokens = cu_num_draft_tokens[-1]
-        # [0, 0, 0, 3, 3, 5]
-        cumsums_offsets = np.repeat(cu_num_draft_tokens - num_draft_tokens,
-                                    num_draft_tokens)
-        # [0, 1, 2, 0, 1, 0]
-        arange = self.arange_np[:total_num_draft_tokens] - cumsums_offsets
+        # cu_num_draft_tokens: [3, 3, 5, 5, 6]
+        # arange: [0, 1, 2, 0, 1, 0]
+        cu_num_draft_tokens, arange = self._get_cumsum_and_arange(
+            num_draft_tokens, cumsum_dtype=np.int32)
         # [0, 0, 0, 5, 5, 9]
         target_logits_indices = np.repeat(
             cu_num_sampled_tokens - num_sampled_tokens, num_draft_tokens)
@@ -928,10 +962,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 
         encoder_outputs = []
         for grouped_mm_inputs in grouped_mm_inputs_list:
-            batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
+            batched_mm_inputs = MultiModalKwargs.batch(
+                grouped_mm_inputs, pin_memory=self.pin_memory)
             batched_mm_inputs = MultiModalKwargs.as_kwargs(
                 batched_mm_inputs,
-                dtype=self.model_config.dtype,
                 device=self.device,
             )
 
@@ -1108,6 +1142,31 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             for k, v in self.intermediate_tensors.items()
         })
 
+    def get_dp_padding(self,
+                       num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
+        dp_size = self.vllm_config.parallel_config.data_parallel_size
+        dp_rank = self.vllm_config.parallel_config.data_parallel_rank
+
+        # For DP: Don't pad when setting enforce_eager.
+        # This lets us set enforce_eager on the prefiller in a P/D setup and
+        # still use CUDA graphs (enabled by this padding) on the decoder.
+        #
+        # TODO(tms) : There are many cases where padding is enabled for
+        # prefills, causing unnecessary and excessive padding of activations.
+
+        if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
+            # Early exit.
+            return 0, None
+
+        num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
+            num_tokens, dp_size, dp_rank)
+        max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
+        num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
+                                                dp_size,
+                                                device="cpu",
+                                                dtype=torch.int32)
+        return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
+
     @torch.inference_mode()
     def execute_model(
         self,
@@ -1145,6 +1204,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             else:
                 num_input_tokens = num_scheduled_tokens
 
+        # Padding for DP
+        num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
+        num_input_tokens += num_pad
+
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
         if self.is_multimodal_model:
@@ -1190,7 +1253,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         # Use persistent buffers for CUDA graphs.
         with set_forward_context(attn_metadata,
                                  self.vllm_config,
-                                 num_tokens=num_input_tokens):
+                                 num_tokens=num_input_tokens,
+                                 num_tokens_across_dp=num_tokens_across_dp):
             self.maybe_setup_kv_connector(scheduler_output)
 
             model_output = self.model(
@@ -1318,7 +1382,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         for i in discard_sampled_tokens_req_indices:
             valid_sampled_token_ids[i].clear()
 
-        if not self.use_spec_decode:
+        if not self.speculative_config:
             # Speculative decoding is not enabled.
             spec_token_ids = None
         elif self.speculative_config.method == "ngram":
@@ -1531,7 +1595,18 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         logger.info("Starting to load model %s...", self.model_config.model)
         with DeviceMemoryProfiler() as m:  # noqa: SIM117
             time_before_load = time.perf_counter()
-            self.model = get_model(vllm_config=self.vllm_config)
+            model_loader = get_model_loader(self.load_config)
+            if not hasattr(self, "model"):
+                logger.info("Loading model from scratch...")
+                self.model = model_loader.load_model(
+                    vllm_config=self.vllm_config,
+                    model_config=self.model_config)
+            else:
+                logger.info(
+                    "Model was already initialized. Loading weights inplace..."
+                )
+                model_loader.load_weights(self.model,
+                                          model_config=self.model_config)
             if self.lora_config:
                 self.model = self.load_lora_model(self.model,
                                                   self.model_config,
@@ -1651,10 +1726,39 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 
         # Must synchronize the non-blocking GPU->CPU transfers.
         if prompt_logprobs_dict:
-            torch.cuda.synchronize()
+            self._sync_device()
 
         return prompt_logprobs_dict
 
+    @contextmanager
+    def maybe_randomize_inputs(self, input_ids: torch.Tensor):
+        """
+        Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
+        This is to help balance expert-selection
+         - during profile_run
+         - during DP rank dummy run 
+        """
+        dp_size = self.vllm_config.parallel_config.data_parallel_size
+        randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1
+        if not randomize_inputs:
+            yield
+        else:
+            import functools
+
+            @functools.cache
+            def rand_input_ids() -> torch.Tensor:
+                return torch.randint_like(
+                    self.input_ids,
+                    low=0,
+                    high=self.model_config.get_vocab_size(),
+                    dtype=input_ids.dtype)
+
+            logger.debug("Randomizing dummy data for DP Rank")
+            input_ids.copy_(rand_input_ids()[:input_ids.size(0)],
+                            non_blocking=True)
+            yield
+            input_ids.fill_(0)
+
     @torch.inference_mode()
     def _dummy_run(
         self,
@@ -1662,12 +1766,16 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         skip_attn: bool = True,
     ) -> torch.Tensor:
 
+        # Padding for DP
+        num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
+        num_tokens += num_pad
+
         # Set num_scheduled_tokens based on num_tokens and max_num_seqs
         # for dummy run with LoRA so that the num_reqs collectively
         # has num_tokens in total.
         assert num_tokens <= self.scheduler_config.max_num_batched_tokens
         max_num_reqs = self.scheduler_config.max_num_seqs
-        num_reqs = max_num_reqs if num_tokens >= max_num_reqs else num_tokens
+        num_reqs = min(num_tokens, max_num_reqs)
         min_tokens_per_req = num_tokens // num_reqs
         num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
         num_scheduled_tokens_list[-1] += num_tokens % num_reqs
@@ -1677,9 +1785,14 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                                         dtype=np.int32)
 
         if skip_attn:
-            attn_metadata: Optional[dict[str, FlashAttentionMetadata]] = None
+            attn_metadata: Optional[dict[str, Any]] = None
         else:
             query_start_loc = self.query_start_loc[:num_reqs + 1]
+            # Make sure max_model_len is used at the graph capture time.
+            self.seq_lens_np[:num_reqs] = self.max_model_len
+            self.seq_lens_np[num_reqs:] = 0
+            self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs],
+                                           non_blocking=True)
             seq_lens = self.seq_lens[:num_reqs]
 
             common_attn_metadata = CommonAttentionMetadata(
@@ -1690,7 +1803,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                     self.kv_cache_config.kv_cache_groups):
                 attn_metadata_i = (
                     self.attn_metadata_builders[kv_cache_group_id].build(
-                        num_reqs=num_tokens,
+                        num_reqs=num_reqs,
                         num_actual_tokens=num_tokens,
                         max_query_len=num_tokens,
                         common_prefix_len=0,
@@ -1726,9 +1839,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 intermediate_tensors = self.sync_and_slice_intermediate_tensors(
                     num_tokens, None, False)
 
-            with set_forward_context(attn_metadata,
-                                     self.vllm_config,
-                                     num_tokens=num_tokens):
+            with self.maybe_randomize_inputs(input_ids), set_forward_context(
+                    attn_metadata,
+                    self.vllm_config,
+                    num_tokens=num_tokens,
+                    num_tokens_across_dp=num_tokens_across_dp):
                 outputs = model(
                     input_ids=input_ids,
                     positions=positions,
@@ -1740,7 +1855,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             else:
                 hidden_states = outputs
 
-            if self.use_spec_decode and self.speculative_config.use_eagle():
+            if self.speculative_config and self.speculative_config.use_eagle():
                 assert isinstance(self.drafter, EagleProposer)
                 self.drafter.dummy_run(num_tokens)
 
@@ -1795,7 +1910,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                     "initializing the engine.") from e
             else:
                 raise e
-        if self.use_spec_decode:
+        if self.speculative_config:
             draft_token_ids = [[0] for _ in range(num_reqs)]
             dummy_spec_decode_metadata = SpecDecodeMetadata.make_dummy(
                 draft_token_ids, self.device)
@@ -1875,10 +1990,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             ).multi_modal_data
 
             batched_dummy_mm_inputs = MultiModalKwargs.batch(
-                [dummy_mm_kwargs] * max_num_mm_items)
+                [dummy_mm_kwargs] * max_num_mm_items,
+                pin_memory=self.pin_memory)
             batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
                 batched_dummy_mm_inputs,
-                dtype=self.model_config.dtype,
                 device=self.device,
             )
 
@@ -1899,7 +2014,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             sampler_output = self._dummy_sampler_run(hidden_states)
         else:
             sampler_output = None
-        torch.cuda.synchronize()
+        self._sync_device()
         del hidden_states, sampler_output
         self.encoder_cache.clear()
         gc.collect()
@@ -1983,59 +2098,168 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             self.attn_backends.append(attn_backend_i)
             self.attn_metadata_builders.append(attn_metadata_builder_i)
 
-    def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
+    def may_reinitialize_input_batch(self,
+                                     kv_cache_config: KVCacheConfig) -> None:
         """
-        Initialize KV cache based on `kv_cache_config`.
+        Re-initialize the input batch if the block sizes are different from
+        `[self.cache_config.block_size]`. This usually happens when there
+        are multiple KV cache groups.
+
         Args:
-            kv_cache_config: Configuration for the KV cache, including the KV
-            cache size of each layer
+            kv_cache_config: The KV cache configuration.
         """
-        if len(kv_cache_config.kv_cache_groups) > 1:
-            raise NotImplementedError(
-                "Hybrid models with more than one KV cache type are not "
-                "supported yet.")
-        self.kv_cache_config = kv_cache_config
-        self.initialize_attn_backend(kv_cache_config)
+        block_sizes = [
+            kv_cache_group.kv_cache_spec.block_size
+            for kv_cache_group in kv_cache_config.kv_cache_groups
+        ]
+        if block_sizes != [self.cache_config.block_size]:
+            assert self.cache_config.cpu_offload_gb == 0, (
+                "Cannot re-initialize the input batch when CPU weight "
+                "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
+                "for more details.")
+            self.input_batch = InputBatch(
+                max_num_reqs=self.max_num_reqs,
+                max_model_len=self.max_model_len,
+                max_num_batched_tokens=self.max_num_tokens,
+                device=self.device,
+                pin_memory=self.pin_memory,
+                vocab_size=self.model_config.get_vocab_size(),
+                block_sizes=block_sizes,
+            )
 
-        kv_caches: dict[str, torch.Tensor] = {}
+    def _allocate_kv_cache_tensors(
+            self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
+        """
+        Initializes the KV cache buffer with the correct size. The buffer needs
+        to be reshaped to the desired shape before being used by the models.
+
+        Args:
+            kv_cache_config: The KV cache config 
+        Returns:
+            dict[str, torch.Tensor]: A map between layer names to their 
+            corresponding memory buffer for KV cache.
+         """
+        kv_cache_raw_tensors: dict[str, torch.Tensor] = {}
+        for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
+            tensor = torch.zeros(kv_cache_tensor.size,
+                                 dtype=torch.int8,
+                                 device=self.device)
+            for layer_name in kv_cache_tensor.shared_by:
+                kv_cache_raw_tensors[layer_name] = tensor
+
+        layer_names = set()
+        for group in kv_cache_config.kv_cache_groups:
+            layer_names.update(group.layer_names)
+        assert layer_names == set(kv_cache_raw_tensors.keys(
+        )), "Some layers are not correctly initialized"
+        return kv_cache_raw_tensors
+
+    def _reshape_kv_cache_tensors(
+        self,
+        kv_cache_config: KVCacheConfig,
+        kv_cache_raw_tensors: dict[str, torch.Tensor],
+    ) -> dict[str, torch.Tensor]:
+        """
+        Reshape the KV cache tensors to the desired shape and dtype.
 
-        for i, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups):
-            kv_cache_spec = kv_cache_group.kv_cache_spec
-            for layer_name in kv_cache_group.layer_names:
-                tensor_config = kv_cache_config.tensors[layer_name]
-                assert tensor_config.size % kv_cache_spec.page_size_bytes == 0
-                num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes
-                # `num_blocks` is the number of blocks the model runner can use.
-                # `kv_cache_config.num_blocks` is the number of blocks that
-                # KVCacheManager may allocate.
-                # Since different GPUs may have different number of layers and
-                # different memory capacities, `num_blocks` can be different on
-                # different GPUs, and `kv_cache_config.num_blocks` is set to
-                # the min of all `num_blocks`. Verify it here.
-                assert num_blocks >= kv_cache_config.num_blocks
+        Args:
+            kv_cache_config: The KV cache config 
+            kv_cache_raw_tensors: The KV cache buffer of each layer, with 
+            correct size but uninitialized shape.
+        Returns:
+            Dict[str, torch.Tensor]: A map between layer names to their 
+            corresponding memory buffer for KV cache.
+        """
+        kv_caches: dict[str, torch.Tensor] = {}
+        for i, kv_cache_group_spec in enumerate(
+                kv_cache_config.kv_cache_groups):
+            kv_cache_spec = kv_cache_group_spec.kv_cache_spec
+            for layer_name in kv_cache_group_spec.layer_names:
+                raw_tensor = kv_cache_raw_tensors[layer_name]
+                assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0
+                num_blocks = (raw_tensor.numel() //
+                              kv_cache_spec.page_size_bytes)
                 if isinstance(kv_cache_spec, AttentionSpec):
                     kv_cache_shape = self.attn_backends[i].get_kv_cache_shape(
                         num_blocks, kv_cache_spec.block_size,
                         kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
                     dtype = kv_cache_spec.dtype
-                    kv_caches[layer_name] = torch.zeros(kv_cache_shape,
-                                                        dtype=dtype,
-                                                        device=self.device)
+                    try:
+                        kv_cache_stride_order = self.attn_backends[
+                            i].get_kv_cache_stride_order()
+                        assert len(kv_cache_stride_order) == len(
+                            kv_cache_shape)
+                    except (AttributeError, NotImplementedError):
+                        kv_cache_stride_order = tuple(
+                            range(len(kv_cache_shape)))
+                    # The allocation respects the backend-defined stride order
+                    # to ensure the semantic remains consistent for each
+                    # backend. We first obtain the generic kv cache shape and
+                    # then permute it according to the stride order which could
+                    # result in a non-contiguous tensor.
+                    kv_cache_shape = tuple(kv_cache_shape[i]
+                                           for i in kv_cache_stride_order)
+                    # Maintain original KV shape view.
+                    inv_order = [
+                        kv_cache_stride_order.index(i)
+                        for i in range(len(kv_cache_stride_order))
+                    ]
+                    kv_caches[layer_name] = kv_cache_raw_tensors[
+                        layer_name].view(dtype).view(kv_cache_shape).permute(
+                            *inv_order)
                 else:
-                    # TODO: add new branches when introducing more types of
-                    # KV cache specs.
-                    raise ValueError("Unknown KV cache spec type.")
+                    raise NotImplementedError
+        return kv_caches
 
-        if self.speculative_config and self.speculative_config.use_eagle():
-            assert isinstance(self.drafter, EagleProposer)
-            # validate all draft model layers belong to the same kv cache
-            # group
-            self.drafter.validate_same_kv_cache_group(kv_cache_config)
+    def initialize_kv_cache_tensors(
+            self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
+        """
+        Initialize the memory buffer for KV cache.
+
+        Args:
+            kv_cache_config: The KV cache config
+        Returns:
+            Dict[str, torch.Tensor]: A map between layer names to their 
+            corresponding memory buffer for KV cache.
+        """
+        # Initialize the memory buffer for KV cache
+        kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config)
+        # Change the memory buffer to the desired shape
+        kv_caches = self._reshape_kv_cache_tensors(kv_cache_config,
+                                                   kv_cache_raw_tensors)
+
+        # Setup `kv_cache_config` and `kv_caches` for models
+        # with cross-layer KV sharing
+        if self.shared_kv_cache_layers:
+            initialize_kv_cache_for_kv_sharing(
+                self.shared_kv_cache_layers,
+                kv_cache_config.kv_cache_groups,
+                kv_caches,
+            )
 
         bind_kv_cache(
             kv_caches,
             self.vllm_config.compilation_config.static_forward_context,
             self.kv_caches)
+        return kv_caches
+
+    def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
+        """
+        Initialize KV cache based on `kv_cache_config`.
+        Args:
+            kv_cache_config: Configuration for the KV cache, including the KV
+            cache size of each layer
+        """
+        self.kv_cache_config = kv_cache_config
+        self.may_reinitialize_input_batch(kv_cache_config)
+        self.initialize_attn_backend(kv_cache_config)
+        kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
+
+        if self.speculative_config and self.speculative_config.use_eagle():
+            assert isinstance(self.drafter, EagleProposer)
+            # validate all draft model layers belong to the same kv cache
+            # group
+            self.drafter.validate_same_kv_cache_group(kv_cache_config)
 
         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(kv_caches)
@@ -2054,6 +2278,18 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         use_mla = self.vllm_config.model_config.use_mla
         kv_cache_spec: dict[str, KVCacheSpec] = {}
         for layer_name, attn_module in layers.items():
+            if (kv_tgt_layer :=
+                    attn_module.kv_sharing_target_layer_name) is not None:
+                # The layer doesn't need its own KV cache and will use that of
+                # the target layer. We skip creating a KVCacheSpec for it, so
+                # that KV cache management logic will act as this layer does
+                # not exist, and doesn't allocate KV cache for the layer. This
+                # enables the memory saving of cross-layer kv sharing, allowing
+                # a given amount of memory to accommodate longer context lengths
+                # or enable more requests to be processed simultaneously.
+                self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
+                continue
+
             # TODO: Support other attention modules, e.g., cross-attention
             if attn_module.attn_type == AttentionType.DECODER:
                 if attn_module.sliding_window is not None:
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index bce5cbb5f9d0e8891f7dc2e148e06a0be0202d91..b7d244f270457879b07bf43a5be21f76ee71abe0 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A GPU worker class."""
 import gc
 import os
@@ -21,7 +22,7 @@ from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import GiB_bytes
+from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.utils import report_usage_stats
@@ -129,7 +130,22 @@ class Worker(WorkerBase):
             _check_if_gpu_supports_dtype(self.model_config.dtype)
             gc.collect()
             torch.cuda.empty_cache()
-            self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+
+            # take current memory snapshot
+            self.init_snapshot = MemorySnapshot()
+            self.requested_memory = (self.init_snapshot.total_memory *
+                                     self.cache_config.gpu_memory_utilization)
+            if self.init_snapshot.free_memory < self.requested_memory:
+                GiB = lambda b: round(b / GiB_bytes, 2)
+                raise ValueError(
+                    f"Free memory on device "
+                    f"({GiB(self.init_snapshot.free_memory)}/"
+                    f"{GiB(self.init_snapshot.total_memory)} GiB) on startup "
+                    f"is less than desired GPU memory utilization "
+                    f"({self.cache_config.gpu_memory_utilization}, "
+                    f"{GiB(self.requested_memory)} GiB). Decrease GPU memory "
+                    f"utilization or reduce GPU memory used by other processes."
+                )
         else:
             raise RuntimeError(
                 f"Not support device type: {self.device_config.device}")
@@ -178,38 +194,39 @@ class Worker(WorkerBase):
         """
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
+        GiB = lambda b: b / GiB_bytes
 
-        _, total_gpu_memory = torch.cuda.mem_get_info()
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
-        self.model_runner.profile_run()
+        with memory_profiling(
+                self.init_snapshot,
+                weights_memory=int(
+                    self.model_runner.model_memory_usage)) as profile_result:
+            self.model_runner.profile_run()
 
-        free_gpu_memory, _ = torch.cuda.mem_get_info()
+        free_gpu_memory = profile_result.after_profile.free_memory
         # NOTE(woosuk): Here we assume that the other processes using the same
         # GPU did not change their memory usage during the profiling.
-        assert self.init_gpu_memory > free_gpu_memory, (
+        assert self.init_snapshot.free_memory > free_gpu_memory, (
             "Error in memory profiling. "
-            f"Initial free memory {self.init_gpu_memory}, current free memory"
-            f" {free_gpu_memory}. This happens when the GPU memory was "
-            "not properly cleaned up before initializing the vLLM instance.")
-
-        # Get the peak memory allocation recorded by torch
-        peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]
-
-        # Check for any memory left around that may have been allocated on the
-        # gpu outside of `torch`. NCCL operations, for example, can use a few
-        # GB during a forward pass
-        torch.cuda.empty_cache()
-        torch_allocated_bytes = torch.cuda.memory_stats(
-        )["allocated_bytes.all.current"]
-        total_allocated_bytes = torch.cuda.mem_get_info(
-        )[1] - torch.cuda.mem_get_info()[0]
-        non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
-        if non_torch_allocations > 0:
-            peak_memory += non_torch_allocations
-        available_kv_cache_memory = (
-            total_gpu_memory * self.cache_config.gpu_memory_utilization -
-            peak_memory)
+            f"Initial free memory {GiB(self.init_snapshot.free_memory)} GiB, "
+            f"current free memory {GiB(free_gpu_memory)} GiB. "
+            "This happens when other processes sharing the same container "
+            "release GPU memory while vLLM is profiling during initialization. "
+            "To fix this, ensure consistent GPU memory allocation or "
+            "isolate vLLM in its own container.")
+        available_kv_cache_memory = self.requested_memory \
+            - profile_result.non_kv_cache_memory
+
+        logger.debug(
+            "Initial free memory: %.2f GiB, free memory: %.2f GiB, "
+            "requested GPU memory: %.2f GiB",
+            GiB(self.init_snapshot.free_memory), GiB(free_gpu_memory),
+            GiB(self.requested_memory))
+        logger.debug(profile_result)
+        logger.info("Available KV cache memory: %.2f GiB",
+                    GiB(available_kv_cache_memory))
+        gc.collect()
 
         return int(available_kv_cache_memory)
 
@@ -292,6 +309,8 @@ class Worker(WorkerBase):
             self.profiler.start()
         else:
             self.profiler.stop()
+            print(self.profiler.key_averages().table(
+                sort_by="self_cuda_time_total"))
 
     def execute_dummy_batch(self) -> None:
         self.model_runner._dummy_run(1)
@@ -339,13 +358,14 @@ def init_worker_distributed_environment(
     rank: int,
     distributed_init_method: Optional[str] = None,
     local_rank: int = -1,
+    backend: str = "nccl",
 ) -> None:
     """Initialize the distributed environment."""
     parallel_config = vllm_config.parallel_config
     set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
 
     init_distributed_environment(parallel_config.world_size, rank,
-                                 distributed_init_method, local_rank)
+                                 distributed_init_method, local_rank, backend)
 
     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
                                       parallel_config.pipeline_parallel_size)
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index 3cbab840e9693784cad601a58d6cc17974d6bfc0..afa41a37eeb34fb801ae5abef335d018daa06c9f 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Define LoRA functionality mixin for model runners.
 """
@@ -80,8 +81,38 @@ class LoRAModelRunnerMixin:
                                       lora_requests)
 
     @contextmanager
-    def maybe_dummy_run_with_lora(self, lora_config: LoRAConfig,
-                                  num_scheduled_tokens: np.ndarray):
+    def maybe_setup_dummy_loras(self, lora_config):
+        if lora_config is None:
+            yield
+        else:
+            # __enter__ code
+            assert self.lora_manager is not None, "LoRA is not enabled"
+
+            num_loras = lora_config.max_loras
+
+            # Make dummy lora requests
+            lora_requests: set[LoRARequest] = {
+                LoRARequest(lora_name=f"warmup_{lora_id}",
+                            lora_int_id=lora_id,
+                            lora_path="/not/a/real/path")
+                for lora_id in range(1, num_loras + 1)
+            }
+
+            with self.lora_manager.dummy_lora_cache():
+                # Add the dummy LoRAs here so _set_active_loras doesn't try to
+                # load from disk.
+                for lr in lora_requests:
+                    self.lora_manager.add_dummy_lora(
+                        lr, rank=self.LORA_WARMUP_RANK)
+
+                yield
+
+            # __exit__ code
+            self.lora_manager.remove_all_adapters()
+
+    @contextmanager
+    def maybe_select_dummy_loras(self, lora_config: LoRAConfig,
+                                 num_scheduled_tokens: np.ndarray):
         if lora_config is None:
             yield
         else:
@@ -108,21 +139,18 @@ class LoRAModelRunnerMixin:
                 for lora_id in range(1, num_loras + 1)
             }
 
-            with self.lora_manager.dummy_lora_cache():
-                # Add the dummy LoRAs here so _set_active_loras doesn't try to
-                # load from disk.
-                for lr in lora_requests:
-                    self.lora_manager.add_dummy_lora(
-                        lr, rank=self.LORA_WARMUP_RANK)
-
-                self._set_active_loras(tuple(prompt_lora_mapping),
-                                       tuple(token_lora_mapping),
-                                       lora_requests)
+            self._set_active_loras(tuple(prompt_lora_mapping),
+                                   tuple(token_lora_mapping), lora_requests)
 
-                yield
+            yield
 
-            # __exit__ code
-            self.lora_manager.remove_all_adapters()
+    @contextmanager
+    def maybe_dummy_run_with_lora(self, lora_config: LoRAConfig,
+                                  num_scheduled_tokens: np.ndarray):
+        with self.maybe_setup_dummy_loras(
+                lora_config), self.maybe_select_dummy_loras(
+                    lora_config, num_scheduled_tokens):
+            yield
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         if not self.lora_manager:
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 46bcf64ed0c395947b9d146813fe1fad605ee67f..d5f40e4d3103c14c4c9591f31453138d8fcc0061 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import bisect
 import gc
 import time
@@ -7,26 +8,29 @@ from unittest.mock import patch
 
 import numpy as np
 import torch
-import torch.distributed
 import torch.nn as nn
 # TPU XLA related
 import torch_xla.core.xla_model as xm
+import torch_xla.distributed.spmd as xs
 import torch_xla.runtime as xr
 
 import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import VllmConfig, get_layers_from_vllm_config
+from vllm.config import ParallelConfig, VllmConfig, get_layers_from_vllm_config
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
-from vllm.model_executor.model_loader import get_model
+from vllm.lora.layers import BaseLayerWithLoRA
+from vllm.model_executor.model_loader import get_model_loader
+from vllm.model_executor.model_loader.tpu import TPUModelLoader
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs,
                                     PlaceholderRange)
 from vllm.multimodal.utils import group_mm_inputs_by_modality
 from vllm.sequence import IntermediateTensors
-from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, cdiv,
+                        is_pin_memory_available)
 from vllm.v1.attention.backends.pallas import (PallasAttentionBackend,
                                                PallasMetadata)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
@@ -41,7 +45,8 @@ from vllm.v1.utils import bind_kv_cache
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 
-from .utils import sanity_check_mm_encoder_outputs
+from .utils import (initialize_kv_cache_for_kv_sharing,
+                    sanity_check_mm_encoder_outputs)
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
@@ -97,6 +102,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         self,
         vllm_config: VllmConfig,
         device: torch.device,
+        original_parallel_config: Optional[ParallelConfig] = None,
     ):
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
@@ -104,6 +110,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         self.lora_config = vllm_config.lora_config
         self.load_config = vllm_config.load_config
         self.parallel_config = vllm_config.parallel_config
+        self.original_parallel_config = original_parallel_config
         self.scheduler_config = vllm_config.scheduler_config
         self.speculative_config = vllm_config.speculative_config
         self.prompt_adapter_config = vllm_config.prompt_adapter_config
@@ -117,6 +124,14 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         self.device = device
         self.check_recompilation = envs.VLLM_XLA_CHECK_RECOMPILATION
 
+        # SPMD Related
+        self.use_spmd = envs.VLLM_XLA_USE_SPMD
+        if self.use_spmd:
+            num_devices = xr.global_runtime_device_count()
+            mesh_shape = (num_devices, 1)
+            device_ids = np.array(range(num_devices))
+            self.mesh = xs.Mesh(device_ids, mesh_shape, ('x', 'y'))
+
         self.enforce_eager = model_config.enforce_eager
 
         self.num_xla_graphs = 0
@@ -124,6 +139,11 @@ class TPUModelRunner(LoRAModelRunnerMixin):
 
         self.pin_memory = is_pin_memory_available()
         self.dtype = self.model_config.dtype
+        if cache_config.cache_dtype == "auto":
+            self.kv_cache_dtype = self.dtype
+        else:
+            self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
+                cache_config.cache_dtype]
         self._hidden_states_dtype = self.dtype
 
         self.is_multimodal_model = model_config.is_multimodal_model
@@ -152,6 +172,9 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         self.hidden_size = model_config.get_hidden_size()
         self.vocab_size = model_config.get_vocab_size()
 
+        if self.lora_config is not None:
+            self.vocab_size += self.lora_config.lora_extra_vocab_size
+
         # Multi-modal data support
         self.mm_registry = MULTIMODAL_REGISTRY
         self.uses_mrope = model_config.uses_mrope
@@ -167,15 +190,25 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         self.encoder_cache_size = encoder_cache_size
 
         # Lazy initialization
-        # self.model: nn.Module  # Set after load_model
+        self.model: nn.Module  # Set after load_model
         self.kv_caches: list[torch.Tensor] = []
         # req_id -> (input_id -> encoder_output)
         self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {}
-        # self.input_batch: InputBatch  # Persistent batch.
 
         # Request states.
         self.requests: dict[str, CachedRequestState] = {}
 
+        # Initialize input batch early to avoid AttributeError in _update_states
+        self.input_batch = InputBatch(
+            max_num_reqs=self.max_num_reqs,
+            max_model_len=self.max_model_len,
+            max_num_batched_tokens=self.max_num_tokens,
+            device=self.device,
+            pin_memory=self.pin_memory,
+            vocab_size=self.model_config.get_vocab_size(),
+            block_sizes=[self.block_size],
+        )
+
         # Cached torch/numpy tensor
         # The pytorch tensor and numpy array share the same buffer.
         # Sometimes the numpy op is faster so we create both.
@@ -212,6 +245,12 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         self.num_reqs_paddings = _get_req_paddings(
             min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs)
 
+        # Layer pairings for cross-layer KV sharing.
+        # If an Attention layer `layer_name` is in the keys of this dict, it
+        # means this layer will perform attention using the keys and values
+        # from the KV cache of `shared_kv_cache_layers[layer_name]`.
+        self.shared_kv_cache_layers: dict[str, str] = {}
+
         # tensors for structured decoding
         self.grammar_bitmask_cpu = torch.zeros(
             (self.max_num_reqs, cdiv(self.vocab_size, 32)),
@@ -257,6 +296,15 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                                        max_num_mm_items_decoder_budget)
                 self.max_num_mm_items_by_modality[modality] = max_num_mm_items
 
+        if not self.use_spmd:
+            self.sample_from_logits_func = torch.compile(
+                self.sample_from_logits,
+                backend="openxla",
+                fullgraph=True,
+                dynamic=False)
+        else:
+            self.sample_from_logits_func = self.sample_from_logits
+
     def _update_num_xla_graphs(self, case_str):
         check_comp = self.check_recompilation and not self.enforce_eager
         if not check_comp:
@@ -365,7 +413,11 @@ class TPUModelRunner(LoRAModelRunnerMixin):
             req_state.num_computed_tokens = req_data.num_computed_tokens
             if not req_data.resumed_from_preemption:
                 # Append the new blocks to the existing block IDs.
-                req_state.block_ids.extend(req_data.new_block_ids)
+                for block_ids, new_block_ids in zip(  # type: ignore[call-overload]
+                        req_state.block_ids,
+                        req_data.new_block_ids,
+                        strict=True):
+                    block_ids.extend(new_block_ids)
             else:
                 # The request is resumed from preemption.
                 # Replace the existing block IDs with the new ones.
@@ -405,7 +457,6 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         return len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0
 
     def get_model(self) -> nn.Module:
-        assert self.model is not None
         return self.model
 
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
@@ -421,13 +472,25 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         block_size = self.vllm_config.cache_config.block_size
         kv_cache_spec: dict[str, KVCacheSpec] = {}
         for layer_name, attn_module in layers.items():
+            if (kv_tgt_layer :=
+                    attn_module.kv_sharing_target_layer_name) is not None:
+                # The layer doesn't need its own KV cache and will use that of
+                # the target layer. We skip creating a KVCacheSpec for it, so
+                # that KV cache management logic will act as this layer does
+                # not exist, and doesn't allocate KV cache for the layer. This
+                # enables the memory saving of cross-layer kv sharing, allowing
+                # a given amount of memory to accommodate longer context lengths
+                # or enable more requests to be processed simultaneously.
+                self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
+                continue
+
             if attn_module.attn_type == AttentionType.DECODER:
                 if attn_module.sliding_window is not None:
                     kv_cache_spec[layer_name] = SlidingWindowSpec(
                         block_size=block_size,
                         num_kv_heads=attn_module.num_kv_heads,
                         head_size=attn_module.head_size,
-                        dtype=attn_module.dtype,
+                        dtype=self.kv_cache_dtype,
                         sliding_window=attn_module.sliding_window,
                         use_mla=False,
                     )
@@ -436,7 +499,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                         block_size=block_size,
                         num_kv_heads=attn_module.num_kv_heads,
                         head_size=attn_module.head_size,
-                        dtype=attn_module.dtype,
+                        dtype=self.kv_cache_dtype,
                         use_mla=False,
                     )
             elif attn_module.attn_type in (AttentionType.ENCODER,
@@ -591,6 +654,17 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         logits_indices = self.query_start_loc_cpu[1:padded_num_reqs + 1] - 1
         logits_indices = logits_indices.to(self.device)
 
+        if self.lora_config is not None:
+            # We need to respect padding when activating LoRA adapters
+            padded_num_scheduled_tokens_per_req = np.copy(
+                num_scheduled_tokens_per_req
+            )  # Copying to avoid accidental state corruption bugs
+            padded_num_scheduled_tokens_per_req[-1] += \
+                padded_total_num_scheduled_tokens - total_num_scheduled_tokens
+
+            self.set_active_loras(self.input_batch,
+                                  padded_num_scheduled_tokens_per_req)
+
         layer_names = get_layers_from_vllm_config(self.vllm_config,
                                                   Attention).keys()
         per_layer_attn_metadata = {
@@ -654,7 +728,6 @@ class TPUModelRunner(LoRAModelRunnerMixin):
             batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
             batched_mm_inputs = MultiModalKwargs.as_kwargs(
                 batched_mm_inputs,
-                dtype=self.model_config.dtype,
                 device=self.device,
             )
 
@@ -801,9 +874,8 @@ class TPUModelRunner(LoRAModelRunnerMixin):
             logits = self.structured_decode(require_struct_decoding,
                                             grammar_bitmask_padded, logits,
                                             arange)
-        selected_token_ids = self.sample_from_logits(logits,
-                                                     tpu_sampling_metadata)
-
+        selected_token_ids = self.sample_from_logits_func(
+            logits, tpu_sampling_metadata)
         # NOTE (NickLucche) Use the original logits (before any penalties or
         # temperature scaling) for the top-k logprobs. We can't enforce it due
         # to recompilations outside torch.compiled code, so just make sure
@@ -911,17 +983,38 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                 "vllm.model_executor.layers.vocab_parallel_embedding."
                 "get_tensor_model_parallel_rank",
                 return_value=xm_tp_rank):
-            model = get_model(vllm_config=self.vllm_config)
+            if self.use_spmd:
+                tpu_loader = TPUModelLoader(
+                    load_config=self.vllm_config.load_config)
+                model = tpu_loader.load_model(
+                    vllm_config=self.vllm_config,
+                    model_config=self.vllm_config.model_config,
+                    mesh=self.mesh)
+            else:
+                # model = get_model(vllm_config=self.vllm_config)
+                model_loader = get_model_loader(self.load_config)
+                if not hasattr(self, "model"):
+                    logger.info("Loading model from scratch...")
+                    model = model_loader.load_model(
+                        vllm_config=self.vllm_config,
+                        model_config=self.model_config)
+                else:
+                    logger.info("Model was already initialized. \
+                            Loading weights inplace...")
+                    model_loader.load_weights(self.model,
+                                              model_config=self.model_config)
         if self.lora_config is not None:
             model = self.load_lora_model(model, self.model_config,
                                          self.scheduler_config,
                                          self.lora_config, self.device)
+            replace_set_lora(model)
 
         # Sync all pending XLA execution during model initialization and weight
         # loading.
         xm.mark_step()
         xm.wait_device_ops()
-        self.model = model
+        if not hasattr(self, "model"):
+            self.model = model
         self.sampler = TPUSampler()
 
     @torch.no_grad()
@@ -933,31 +1026,25 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                                         device=self.device)
         else:
             input_ids = torch.zeros((num_tokens),
-                                    dtype=torch.int32,
-                                    device=self.device)
+                                    dtype=torch.int32).to(self.device)
             inputs_embeds = None
         actual_num_reqs = min(num_tokens, self.max_num_reqs)
         position_ids = torch.zeros(num_tokens,
-                                   dtype=torch.int32,
-                                   device=self.device)
+                                   dtype=torch.int32).to(self.device)
         slot_mapping = torch.zeros(num_tokens,
-                                   dtype=torch.int64,
-                                   device=self.device)
+                                   dtype=torch.int64).to(self.device)
         block_tables = torch.zeros(
             (self.max_num_reqs, self.block_table_cpu.shape[1]),
-            dtype=torch.int32,
-            device=self.device)
+            dtype=torch.int32).to(self.device)
         query_lens = [1] * self.max_num_reqs
         query_start_loc = torch.cumsum(torch.tensor([0] + query_lens,
                                                     dtype=torch.int32),
                                        dim=0,
                                        dtype=torch.int32).to(self.device)
         context_lens = torch.ones((self.max_num_reqs, ),
-                                  dtype=torch.int32,
-                                  device=self.device)
+                                  dtype=torch.int32).to(self.device)
         num_seqs = torch.tensor([actual_num_reqs],
-                                dtype=torch.int32,
-                                device=self.device)
+                                dtype=torch.int32).to(self.device)
         attn_metadata = PallasMetadata(
             slot_mapping=slot_mapping,
             block_tables=block_tables,
@@ -980,7 +1067,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
             for layer_name in layer_names
         }
 
-        with self.maybe_dummy_run_with_lora(
+        with self.maybe_select_dummy_loras(
                 self.lora_config,
                 np.array([num_tokens], dtype=np.int32)), set_forward_context(
                     per_layer_attn_metadata, self.vllm_config, 0):
@@ -989,6 +1076,13 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                              inputs_embeds=inputs_embeds)
         self._hidden_states_dtype = out.dtype
 
+    def _set_active_loras(self, prompt_lora_mapping, token_lora_mapping,
+                          lora_requests) -> None:
+        xm.mark_step()  # Captures input updates
+        super()._set_active_loras(prompt_lora_mapping, token_lora_mapping,
+                                  lora_requests)
+        xm.mark_step()  # Captures metadata updates
+
     def _precompile_mm_encoder(self) -> None:
         # Pre-compile MM encoder for all supported data modalities.
         hf_config = self.vllm_config.model_config.hf_config
@@ -1151,7 +1245,11 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                         generate_params_if_all_greedy,
                     ))
                 sampling_metadata.all_greedy = all_greedy
-                self.sample_from_logits(dummy_logits, sampling_metadata)
+                with self.maybe_select_dummy_loras(
+                        self.lora_config, np.array([num_reqs],
+                                                   dtype=np.int32)):
+                    self.sample_from_logits_func(dummy_logits,
+                                                 sampling_metadata)
             logger.info("  -- num_seqs: %d", num_reqs)
         xm.wait_device_ops()
         end = time.perf_counter()
@@ -1167,7 +1265,9 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                                        dtype=self._hidden_states_dtype)
             dummy_tokens = torch.zeros((num_reqs, 1),
                                        dtype=torch.int64).to(self.device)
-            self.gather_logprobs(dummy_logits, dummy_tokens)
+            with self.maybe_select_dummy_loras(
+                    self.lora_config, np.array([num_reqs], dtype=np.int32)):
+                self.gather_logprobs(dummy_logits, dummy_tokens)
             logger.info("  -- num_seqs: %d", num_reqs)
         xm.wait_device_ops()
         end = time.perf_counter()
@@ -1178,13 +1278,14 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         """
         Precompile all the subgraphs with possible input shapes.
         """
-        self._precompile_mm_encoder()
-        self._precompile_backbone()
-        self._precompile_select_hidden_states()
-        self._precompile_compute_logits()
-        self._precompile_structured_decoding()
-        self._precompile_sample_from_logits()
-        self._precompile_gather_logprobs()
+        with self.maybe_setup_dummy_loras(self.lora_config):
+            self._precompile_mm_encoder()
+            self._precompile_backbone()
+            self._precompile_select_hidden_states()
+            self._precompile_compute_logits()
+            self._precompile_structured_decoding()
+            self._precompile_sample_from_logits()
+            self._precompile_gather_logprobs()
 
     def profile_run(
         self,
@@ -1257,46 +1358,78 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                 "Hybrid models with more than one KV cache type are not "
                 "supported yet.")
 
-        self.input_batch = InputBatch(
-            max_num_reqs=self.max_num_reqs,
-            max_model_len=self.max_model_len,
-            max_num_batched_tokens=self.max_num_tokens,
-            device=self.device,
-            pin_memory=self.pin_memory,
-            vocab_size=self.model_config.get_vocab_size(),
-            block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec.
-            block_size,
-        )
+        if kv_cache_config.kv_cache_groups[
+                0].kv_cache_spec.block_size != self.block_size:
+            self.input_batch = InputBatch(
+                max_num_reqs=self.max_num_reqs,
+                max_model_len=self.max_model_len,
+                max_num_batched_tokens=self.max_num_tokens,
+                device=self.device,
+                pin_memory=self.pin_memory,
+                vocab_size=self.model_config.get_vocab_size(),
+                block_sizes=[
+                    kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size
+                ],
+            )
+        # Verify dtype compatibility between block_table_cpu and input_batch
         assert self.block_table_cpu.dtype == self.input_batch.block_table[
             0].get_cpu_tensor().dtype
 
-        kv_caches: dict[str, torch.Tensor] = {}
+        kv_cache_sizes = {}
+        for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
+            assert len(kv_cache_tensor.shared_by) == 1, (
+                "KV cache tensor shared by multiple layers is not supported in "
+                "TPU.")
+            kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
 
+        kv_caches: dict[str, torch.Tensor] = {}
         for kv_cache_group in kv_cache_config.kv_cache_groups:
             kv_cache_spec = kv_cache_group.kv_cache_spec
             for layer_name in kv_cache_group.layer_names:
-                tensor_config = kv_cache_config.tensors[layer_name]
-                assert tensor_config.size % kv_cache_spec.page_size_bytes == 0
-                num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes
+                tensor_size = kv_cache_sizes[layer_name]
+                assert tensor_size % kv_cache_spec.page_size_bytes == 0
+                num_blocks = tensor_size // kv_cache_spec.page_size_bytes  # noqa
                 if isinstance(kv_cache_spec, AttentionSpec):
+                    if self.use_spmd:
+                        num_kv_heads = kv_cache_spec.num_kv_heads
+                        assert self.original_parallel_config is not None
+                        tp_size = \
+                            self.original_parallel_config.tensor_parallel_size
+                        # TODO: Handle kv cache duplication under SPMD mode.
+                        assert num_kv_heads % tp_size == 0, (
+                            f"num_kv_heads {num_kv_heads} must be divisible by "
+                            f"tp_size {tp_size} under SPMD mode")
                     kv_cache_shape = PallasAttentionBackend.get_kv_cache_shape(
                         num_blocks, kv_cache_spec.block_size,
                         kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
                     dtype = kv_cache_spec.dtype
 
                     tpu_kv_cache = torch.zeros(kv_cache_shape,
-                                               dtype=dtype,
-                                               device=self.device)
+                                               dtype=dtype).to(self.device)
 
                     kv_caches[layer_name] = tpu_kv_cache
                 else:
                     raise NotImplementedError
 
+        # Setup `kv_cache_config` and `kv_caches` for models
+        # with cross-layer KV sharing
+        if self.shared_kv_cache_layers:
+            initialize_kv_cache_for_kv_sharing(
+                self.shared_kv_cache_layers,
+                kv_cache_config.kv_cache_groups,
+                kv_caches,
+            )
+
         bind_kv_cache(
             kv_caches,
             self.vllm_config.compilation_config.static_forward_context,
             self.kv_caches)
 
+        if self.use_spmd:
+            # Shard KV Cache
+            for cache in self.kv_caches:
+                xs.mark_sharding(cache, self.mesh, (None, 'x', None, None))
+
     def reset_dynamo_cache(self):
         if self.is_multimodal_model:
             compiled_model = self.model.get_language_model().model
@@ -1317,7 +1450,9 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                        sample_hidden_states: torch.Tensor) -> torch.Tensor:
         return self.model.compute_logits(sample_hidden_states, None)
 
-    @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
+    # TODO: Under SPMD mode, sample_from_logits has correctness issue.
+    #       Re-enable the torch.compile once the issue is fixed in torchxla.
+    # @torch.compile(backend="openxla", fullgraph=True, dynamic=False)
     def sample_from_logits(
             self, logits: torch.Tensor,
             sampling_metadata: TPUSupportedSamplingMetadata) -> torch.Tensor:
@@ -1440,7 +1575,6 @@ class TPUModelRunner(LoRAModelRunnerMixin):
                                                          batch_size)
         return MultiModalKwargs.as_kwargs(
             batched_dummy_mm_inputs,
-            dtype=self.model_config.dtype,
             device=self.device,
         )
 
@@ -1467,11 +1601,11 @@ def _get_token_paddings(min_token_size: int, max_token_size: int,
                         padding_gap: int) -> list[int]:
     """Generate a list of padding size, starting from min_token_size, 
     ending with a number that can cover max_token_size
-    
+
     If padding_gap == 0 then:
         increase 2X each time (exponential)
     else:
-        first increase the size to twice, 
+        first increase the size to twice,
         then increase the padding size by padding_gap.
     """
     # assert min_token_size is power of 2
@@ -1508,3 +1642,32 @@ def _get_padded_token_len(paddings: list[int], x: int) -> int:
     index = bisect.bisect_left(paddings, x)
     assert index < len(paddings)
     return paddings[index]
+
+
+def replace_set_lora(model):
+
+    def _tpu_set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: Optional[torch.Tensor],
+        bias: Optional[torch.Tensor] = None,
+    ):
+        # TODO: The integer index leads to a recompilation, but converting it
+        # to a tensor doesn't seem to work anymore. This might be fixed with a
+        # later release of torch_xla.
+        self._original_set_lora(index, lora_a, lora_b, embeddings_tensor, bias)
+        xm.mark_step()
+
+    def _tpu_reset_lora(self, index: int):
+        self._original_reset_lora(index)
+        xm.mark_step()
+
+    for _, module in model.named_modules():
+        if isinstance(module, BaseLayerWithLoRA):
+            module._original_set_lora = module.set_lora
+            module._original_reset_lora = module.reset_lora
+            module.set_lora = _tpu_set_lora.__get__(module, module.__class__)
+            module.reset_lora = _tpu_reset_lora.__get__(
+                module, module.__class__)
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index fa4eb30ccd9a687cfe93afeb16839bddfb9a87cd..5da481baeeea7868f1d661d9312a0ab043b99b47 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A TPU worker class."""
 import os
 from typing import Optional
@@ -45,6 +46,15 @@ class TPUWorker:
         self.lora_config = vllm_config.lora_config
         self.load_config = vllm_config.load_config
         self.parallel_config = vllm_config.parallel_config
+        self.use_spmd = envs.VLLM_XLA_USE_SPMD
+        self.original_parallel_config = None
+        if self.use_spmd:
+            # Under SPMD mode, distributed env is initialized as if there is
+            # only one worker/device.
+            self.original_parallel_config = self.parallel_config
+            self.parallel_config.tensor_parallel_size = 1
+            self.parallel_config.pipeline_parallel_size = 1
+            self.parallel_config.world_size = 1
         self.scheduler_config = vllm_config.scheduler_config
         self.device_config = vllm_config.device_config
         self.speculative_config = vllm_config.speculative_config
@@ -83,10 +93,6 @@ class TPUWorker:
         if self.model_config.seed is None:
             self.model_config.seed = 0
 
-        if vllm_config.lora_config is not None:
-            raise NotImplementedError(
-                "The V1 TPU backend doesn't support LoRA serving")
-
     def init_device(self):
         os.environ["PJRT_DEVICE"] = "TPU"
         # Note: Currently the XLA compiler wrongly uses 2D ring strategy on 1D
@@ -94,15 +100,18 @@ class TPUWorker:
         # `xla_tpu_force_1d_allreduce_at_chunk_count` is a temporary solution to
         # fix this. It will be removed after the bug in XLA compiler is fixed.
         os.environ["LIBTPU_INIT_ARGS"] = (
-            "--xla_tpu_force_1d_allreduce_at_chunk_count=1")
+            os.environ.get("LIBTPU_INIT_ARGS", "") +
+            " --xla_tpu_force_1d_allreduce_at_chunk_count=1"
+            " --xla_jf_conv_input_fusion=False")
+        # --xla_jf_conv_input_fusion=False is used to improve the perf of
+        # quantized matmul.
         torch.set_grad_enabled(False)
         torch.set_default_dtype(self.model_config.dtype)
 
         # Initialize the distributed environment.
-        init_tpu_worker_distributed_environment(self.parallel_config,
-                                                self.rank,
-                                                self.distributed_init_method,
-                                                self.local_rank)
+        self._init_tpu_worker_distributed_environment(
+            self.parallel_config, self.rank, self.distributed_init_method,
+            self.local_rank)
 
         # Device initialization should happen after initializing
         # the distributed runtime.
@@ -136,7 +145,9 @@ class TPUWorker:
             xr.initialize_cache(per_rank_path, readonly=False)
 
         # Init ModelRunner here, so that we have access to self.device.
-        self.model_runner = TPUModelRunner(self.vllm_config, self.device)
+        self.model_runner = \
+            TPUModelRunner(self.vllm_config, self.device,
+                           self.original_parallel_config)
 
         if rank == 0:
             # If usage stat is enabled, collect relevant info.
@@ -151,9 +162,7 @@ class TPUWorker:
 
                 # Use an empty tensor instead of `None`` to force Dynamo to pass
                 # it by reference, rather by specializing on the value ``None``.
-                tpu_kv_cache = torch.tensor([],
-                                            dtype=dtype,
-                                            device=self.device)
+                tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device)
                 kv_caches[layer_name] = tpu_kv_cache
             else:
                 raise NotImplementedError(
@@ -166,7 +175,8 @@ class TPUWorker:
             runner_kv_caches)
 
         # `max_num_tokens >= max_num_batched_tokens` due to padding.
-        self.model_runner.profile_run(self.model_runner.max_num_tokens)
+        with self.model_runner.maybe_setup_dummy_loras(self.lora_config):
+            self.model_runner.profile_run(self.model_runner.max_num_tokens)
 
         # Synchronize before measuring the memory usage.
         xm.wait_device_ops()
@@ -181,9 +191,20 @@ class TPUWorker:
 
         # Get the maximum amount of memory used by the model weights and
         # intermediate activations.
-        m = xm.get_memory_info(self.device)
-        total_memory_size = m["bytes_limit"]
-        current_mem = m["bytes_used"]
+        if self.use_spmd:
+            # This is a workaround for the TPU SPMD mode. The get_memory_info
+            # API doesn't work with SPMD mode in PyTorch/XLA.
+            # TODO: use xm.get_memory_info for SPMD once it's supported in
+            # PyTorch/XLA.
+            import tpu_info
+            chip_type, _ = tpu_info.device.get_local_chips()
+            device_usage = tpu_info.metrics.get_chip_usage(chip_type)
+            total_memory_size = device_usage[0].total_memory
+            current_mem = device_usage[0].memory_usage
+        else:
+            m = xm.get_memory_info(self.device)
+            total_memory_size = m["bytes_limit"]
+            current_mem = m["bytes_used"]
         # Ideally we would use profiled = m["peak_bytes_used"] to
         # get weights + activations. But there is memory used during
         # compilation / weight loading that impacts the peak and
@@ -244,28 +265,30 @@ class TPUWorker:
         # worker will always be healthy as long as it's running.
         return
 
-
-def init_tpu_worker_distributed_environment(
-    parallel_config: ParallelConfig,
-    rank: int,
-    distributed_init_method: Optional[str] = None,
-    local_rank: int = -1,
-) -> None:
-    """Initialize the distributed environment."""
-
-    # NOTE(woosuk): This is just to initialize the TP group and broadcast
-    # the input objects on CPU. The all-reduce and all-gather ops on TPU
-    # are invoked by `xm.all_reduce` and `xm.all_gather` which use their
-    # own context.
-    init_distributed_environment(
-        world_size=parallel_config.world_size,
-        rank=rank,
-        local_rank=local_rank,
-        distributed_init_method=distributed_init_method,
-        backend="gloo",
-    )
-    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
-                                      parallel_config.pipeline_parallel_size)
+    def _init_tpu_worker_distributed_environment(
+        self,
+        parallel_config: ParallelConfig,
+        rank: int,
+        distributed_init_method: Optional[str] = None,
+        local_rank: int = -1,
+    ) -> None:
+        """Initialize the distributed environment."""
+        if self.use_spmd:
+            xr.use_spmd()
+        # NOTE(woosuk): This is just to initialize the TP group and broadcast
+        # the input objects on CPU. The all-reduce and all-gather ops on TPU
+        # are invoked by `xm.all_reduce` and `xm.all_gather` which use their
+        # own context.
+        init_distributed_environment(
+            world_size=parallel_config.world_size,
+            rank=rank,
+            local_rank=local_rank,
+            distributed_init_method=distributed_init_method,
+            backend="gloo",
+        )
+        ensure_model_parallel_initialized(
+            parallel_config.tensor_parallel_size,
+            parallel_config.pipeline_parallel_size)
 
 
 try:
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 91548a52cfc708823a5ba94700a8423973639eb2..055cf01530f02714cdd462ad956102ec28fc89bf 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -1,8 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
 
+from vllm.v1.kv_cache_interface import KVCacheGroupSpec
+
 
 def sanity_check_mm_encoder_outputs(
     mm_embeddings: object,
@@ -72,3 +75,37 @@ def gather_mm_placeholders(
         return placeholders
 
     return placeholders[is_embed]
+
+
+def initialize_kv_cache_for_kv_sharing(
+    shared_kv_cache_layers: dict[str, str],
+    kv_cache_groups: list[KVCacheGroupSpec],
+    kv_caches: dict[str, torch.Tensor],
+) -> None:
+    """
+    Sets up KV cache sharing by reusing the allocated KV caches in `kv_caches`
+    for layers that do not allocate its own KV cache, based on the mapping in
+    `shared_kv_cache_layers`. Adds these layers to the corresponding KV cache
+    group, which is needed to ensure that attention metadata is assigned later.
+
+    Args:
+        shared_kv_cache_layers: Layer pairings for cross-layer KV sharing.
+            If an Attention layer `layer_name` is in the keys of this dict, it
+            means this layer will perform attention using the keys and values
+            from the KV cache of `shared_kv_cache_layers[layer_name]`.
+        kv_cache_groups: The KV cache groups of the model.
+        kv_caches: The allocated kv_caches with layer names as keys.
+            Note that layers in shared_kv_cache_layers.keys() are not
+            originally included as it only contains layers which have its own
+            KV cache allocation.
+    """
+    # Record index of KV cache group for each layer that allocates a KV cache.
+    layer_to_kv_cache_group_idx: dict[str, int] = {}
+    for i, kv_cache_group in enumerate(kv_cache_groups):
+        for layer_name in kv_cache_group.layer_names:
+            layer_to_kv_cache_group_idx[layer_name] = i
+
+    for layer_name, target_layer_name in shared_kv_cache_layers.items():
+        kv_caches[layer_name] = kv_caches[target_layer_name]
+        group_idx = layer_to_kv_cache_group_idx[target_layer_name]
+        kv_cache_groups[group_idx].layer_names.append(layer_name)
diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py
index 487a49b6211e2b2ee1df7787bcddedd66f22ce4e..9c93754f93f812809a2565114997cf692dd9309e 100644
--- a/vllm/v1/worker/worker_base.py
+++ b/vllm/v1/worker/worker_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from typing import Optional
 
diff --git a/vllm/version.py b/vllm/version.py
index 8329d7becb683d600d3456d8b704257d21cb6659..6c88b1b5a3bf4180bca6a93ce549e6ee9906ea81 100644
--- a/vllm/version.py
+++ b/vllm/version.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 try:
     from ._version import __version__, __version_tuple__
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index d48a6957c5ddafba0a6ca1298419c33887814ce5..530907012f7049da1196ae0f279bc80dcef9eab3 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """CacheEngine class for managing the KV cache."""
 from typing import List
 
diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py
index 82eeeb570d2223ce04d8608f4b21f1888d4c287d..c99e2652a3972ff499ff508d32cb03b97014d433 100644
--- a/vllm/worker/cpu_enc_dec_model_runner.py
+++ b/vllm/worker/cpu_enc_dec_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast
@@ -299,7 +300,6 @@ class CPUEncoderDecoderModelRunner(
             model_input.encoder_input_positions,
             **MultiModalKwargs.as_kwargs(
                 model_input.multi_modal_kwargs or {},
-                dtype=self.model_config.dtype,
                 device=self.device,
             ),
             "intermediate_tensors":
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index fb436a079f878e75d62127461bf1f74744f3361c..68cdf65cafa793534506c7ab9ad48cbb3e615e50 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import weakref
@@ -629,7 +630,6 @@ class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]):
         if model_input.multi_modal_kwargs is not None:
             multimodal_kwargs = MultiModalKwargs.as_kwargs(
                 model_input.multi_modal_kwargs,
-                dtype=self.model_config.dtype,
                 device=self.device,
             )
         execute_model_kwargs = {}
diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py
index 2a60e51261ad647d681c64a6eb38198cf57c9494..203fdf225a41a56b1bf48ff97b0e1b616562d5b8 100644
--- a/vllm/worker/cpu_pooling_model_runner.py
+++ b/vllm/worker/cpu_pooling_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from typing import Any, Dict, List, Optional, Tuple, Type, Union
@@ -52,7 +53,6 @@ class CPUPoolingModelRunner(
             model_input.input_positions,
             **MultiModalKwargs.as_kwargs(
                 model_input.multi_modal_kwargs or {},
-                dtype=self.model_config.dtype,
                 device=self.device,
             ),
             **cross_enc_kwargs,
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 1436a404335a0a8a9841da73f4ba1dae8275c660..9e834befd68abb43e1bf30ebe619a786741d6191 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A CPU worker class."""
 import os
+from importlib import util
 from typing import Dict, List, Optional, Set, Tuple, Type
 
 import torch
@@ -155,8 +157,10 @@ class CPUWorker(LocalOrDistributedWorkerBase):
 
         # Setup OpenMP threads affinity.
         omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
-        if omp_cpuids == "all":
-            self.local_omp_cpuid = "all"
+        self.local_omp_cpuid = "all"
+        if omp_cpuids == "auto":
+            self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes(
+            )
         else:
             self.local_omp_cpuid = omp_cpuids.split("|")[rank]
 
@@ -398,3 +402,49 @@ class CPUWorker(LocalOrDistributedWorkerBase):
         return CPUCacheEngine.get_cache_block_size(
             self.cache_config.block_size, self.cache_config.cache_dtype,
             self.model_config, self.parallel_config)
+
+    def get_cpus_id_binding_based_on_numa_nodes(self) -> str:
+        """Return CPUs id binding based on NUMA nodes.
+        """
+        rank_to_cpus = self.local_omp_cpuid
+        # Setup OpenMP thread affinity based on NUMA nodes automatically
+        world_size = self.vllm_config.parallel_config.world_size
+        libnuma_found = util.find_spec("numa") is not None
+        psutil_found = util.find_spec("psutil") is not None
+        if libnuma_found and psutil_found:
+            import psutil
+            from numa import info
+            cpu_count = psutil.cpu_count(logical=False)
+            cpus_allow_list = psutil.Process().cpu_affinity()
+            numa_size = info.get_num_configured_nodes()
+            cpu_count_per_numa = cpu_count // numa_size
+            num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU,
+                                      cpu_count_per_numa // 2)
+
+            # check allow node_to_cpus list
+            node_to_cpus = []
+            for i in range(numa_size):
+                node_intersect = set(
+                    info.node_to_cpus(i)).intersection(cpus_allow_list)
+                if bool(node_intersect):
+                    node_to_cpus.append(list(node_intersect))
+
+            if world_size > len(node_to_cpus):
+                logger.error(
+                    "Auto thread-binding failed due to "
+                    "world size: %d is larger than "
+                    "allowed NUMA nodes number: %d."
+                    "Please try to bind threads manually.", world_size,
+                    len(node_to_cpus))
+            else:
+                end = cpu_count_per_numa - num_of_reserved_cpu
+                rank_to_cpus_list = node_to_cpus[self.rank][:end]
+                rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list)
+                logger.info("auto thread-binding list: %s", rank_to_cpus)
+        else:
+            logger.warning(
+                "Auto thread-binding is not supported due to "
+                "the lack of package numa and psutil,"
+                "fallback to no thread-binding. To get better performance,"
+                "please try to manually bind threads.")
+        return rank_to_cpus
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index 3957e5608524fae7218dee14aada7199406568a5..8d92edc5b386edd3d83237bc95fd9885bd249387 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import itertools
@@ -204,7 +205,6 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
                 intermediate_tensors=intermediate_tensors,
                 **MultiModalKwargs.as_kwargs(
                     multi_modal_kwargs,
-                    dtype=self.model_config.dtype,
                     device=self.device,
                 ),
                 **seqlen_agnostic_kwargs,
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index e2261cbb26b442100399a6150db7aa7fd27bb74e..17123d2b483754f6b4ee1a61ab0d1af772f0260e 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 ###############################################################################
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 533fead0e669edf08420a54f91a52ea9d5513083..6d76ea499a901695f7b55a468c37f6061d9cf8eb 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 ###############################################################################
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 8c968faa78101978ada448882f52474c5a987500..82db6617ba55fe21491f3ca68b79ef915c1faf44 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import gc
@@ -1847,7 +1848,6 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
                     intermediate_tensors=intermediate_tensors,
                     **MultiModalKwargs.as_kwargs(
                         multi_modal_kwargs,
-                        dtype=self.model_config.dtype,
                         device=self.device,
                     ),
                     **seqlen_agnostic_kwargs,
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index 935325cb2e1c07b3d879f3600062485e01395989..d567ce4a6e78fd4886ae66e50c909a1c4f3c226c 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from abc import ABC, abstractmethod
diff --git a/vllm/worker/multi_step_hpu_worker.py b/vllm/worker/multi_step_hpu_worker.py
index 2c5e2eac75898ab09ae5509338e7d504aa769501..f0210c13c75538de79b9fbdc3c61db9ffdea108c 100644
--- a/vllm/worker/multi_step_hpu_worker.py
+++ b/vllm/worker/multi_step_hpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 ###############################################################################
 # Copyright (C) 2025 Habana Labs, Ltd. an Intel Company
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index f8d5acf586c51c0da11acf298af7e8dc87addfca..cc0cc855e7be44367cf182ee8a94b36cd02aeacb 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import functools
diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py
index aafb7ab7cfb8d4b2e4ec75285271404209012dec..25f588077cb42d08edea09dc1d20236de421eade 100644
--- a/vllm/worker/multi_step_neuron_model_runner.py
+++ b/vllm/worker/multi_step_neuron_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from importlib.util import find_spec
 from typing import List, Optional
@@ -72,7 +73,6 @@ class MultiStepNeuronModelRunner(NeuronModelRunner):
             input_block_ids=model_input.input_block_ids,
             **MultiModalKwargs.as_kwargs(
                 model_input.multi_modal_kwargs or {},
-                dtype=self.model_config.dtype,
                 device=self.device,
             ),
         )
diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py
index 3a9c0993e004fae69c4a0ceb7e485ed9211b0de1..dd521dd67dad0d57ad29939a4fd29de792c7c0d6 100644
--- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py
+++ b/vllm/worker/multi_step_neuronx_distributed_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import List, Optional
 
 import torch
@@ -51,7 +52,6 @@ class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner):
             sampling_params=sampling_params,
             **MultiModalKwargs.as_kwargs(
                 model_input.multi_modal_kwargs or {},
-                dtype=self.model_config.dtype,
                 device=self.device,
             ),
         )
diff --git a/vllm/worker/multi_step_tpu_worker.py b/vllm/worker/multi_step_tpu_worker.py
index 3871199987cee1ee8bf2066272edd8688b7f562a..ed9f001666159eb095f306b887dab376c2a01c11 100644
--- a/vllm/worker/multi_step_tpu_worker.py
+++ b/vllm/worker/multi_step_tpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from typing import Dict, Optional, Tuple
diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py
index 3518ab2f64fed8301d984000d43344c92afaae6c..ea16e14f9ecd4c55a6323d06aa8aeac42cf3407f 100644
--- a/vllm/worker/multi_step_worker.py
+++ b/vllm/worker/multi_step_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from dataclasses import dataclass
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index 968596471a26ef6808d877bb87dc6ece4659fdb8..7ccf1a2c0a87665960f8154f97f3c83f423f3ce7 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -1,14 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
 
 import torch
 from torch import nn
 
 from vllm.config import DeviceConfig, VllmConfig
 from vllm.logger import init_logger
+from vllm.lora.layers import LoRAMapping
+from vllm.lora.request import LoRARequest
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader.neuron import get_neuron_model
@@ -36,6 +39,7 @@ class ModelInputForNeuron(ModelRunnerInputBase):
     input_block_ids: Optional[torch.Tensor] = None
     sampling_metadata: SamplingMetadata = None
     multi_modal_kwargs: BatchedTensorInputs = None
+    adapter_ids: Optional[str] = None
 
     def as_broadcastable_tensor_dict(
             self) -> Dict[str, Union[int, torch.Tensor]]:
@@ -80,6 +84,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                            "The model will run without sliding window.")
         self.device_config = (self.device_config if self.device_config
                               is not None else DeviceConfig())
+        self.lora_config = vllm_config.lora_config
         self.device = self.device_config.device
         self.pin_memory = is_pin_memory_available()
 
@@ -165,6 +170,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
 
             mm_kwargs = seq_group_metadata.multi_modal_data
             if mm_kwargs:
+                mm_kwargs = self.process_multi_modal_data_neuron(mm_kwargs)
                 multi_modal_kwargs_list.append(mm_kwargs)
 
         max_seq_len = max(seq_lens)
@@ -270,6 +276,14 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                 sampling_params.top_p = top_p
                 sampling_params.temperature = temperature
 
+        # we need multi_modal_data for later tokens as well
+        multi_modal_kwargs_list: List[MultiModalKwargs] = []
+        for seq_group_metadata in seq_group_metadata_list:
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                multi_modal_kwargs_list.append(mm_data)
+        multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
+
         sampling_metadata = SamplingMetadata.prepare(
             seq_group_metadata_list,
             seq_lens,
@@ -378,9 +392,9 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                 positions=model_input.input_positions,
                 input_block_ids=model_input.input_block_ids,
                 sampling_params=sampling_params,
+                adapter_ids=model_input.adapter_ids,
                 **MultiModalKwargs.as_kwargs(
                     model_input.multi_modal_kwargs or {},
-                    dtype=self.model_config.dtype,
                     device=self.device,
                 ),
             )
@@ -393,7 +407,6 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                 input_block_ids=model_input.input_block_ids,
                 **MultiModalKwargs.as_kwargs(
                     model_input.multi_modal_kwargs or {},
-                    dtype=self.model_config.dtype,
                     device=self.device,
                 ),
             )
@@ -416,3 +429,32 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
     @property
     def vocab_size(self) -> int:
         return self.model_config.get_vocab_size()
+
+    def process_multi_modal_data_neuron(self, mm_data):
+        # this is a no-op for NeuronModelRunner
+        return mm_data
+
+    def remove_all_loras(self):
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+
+    def set_active_loras(self, lora_requests: Set[LoRARequest],
+                         lora_mapping: LoRAMapping) -> None:
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+
+    def add_lora(self, lora_request: LoRARequest):
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+
+    def remove_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+
+    def pin_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+
+    def list_loras(self) -> Set[int]:
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index aa8e39613eec835792236bee8906ee69bc593f07..662bde6bc07b03b5f0167def032627276a82e327 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A Neuron worker class."""
 import os
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
 
 import torch.distributed
 
@@ -9,19 +10,19 @@ from vllm.config import VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.platforms.neuron import NeuronFramework
 from vllm.sequence import ExecuteModelRequest
 from vllm.worker.neuron_model_runner import NeuronModelRunner
-from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
-                                     LoRANotSupportedWorkerBase, WorkerBase,
+from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
                                      WorkerInput)
 
 logger = init_logger(__name__)
 
 
-class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
+class NeuronWorker(LocalOrDistributedWorkerBase):
     """A worker class that executes the model on a group of neuron cores.
     """
 
@@ -38,6 +39,7 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         self.rank = rank
         self.distributed_init_method = distributed_init_method
         self.is_driver_worker = is_driver_worker
+        self.lora_config = vllm_config.lora_config
 
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
@@ -59,6 +61,9 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
                 "[transformers-neuronx, neuronx-distributed-inference]")
 
     def get_tnx_model_runner(self, vllm_config):
+        assert (self.lora_config
+                is None), ("LoRA is not supported for TransformersNeuronX "
+                           "framework.")
         from vllm.worker.multi_step_neuron_model_runner import (
             MultiStepNeuronModelRunner)
         if self.speculative_config is not None:
@@ -72,6 +77,8 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         from vllm.worker.neuronx_distributed_model_runner import (
             NeuronxDistributedModelRunner)
         if self.speculative_config is not None:
+            assert (self.lora_config
+                    is None), "LoRA is not supported for Speculative Decoding"
             return MultiStepNeuronxDistributedModelRunner(
                 vllm_config=vllm_config)
         else:
@@ -156,3 +163,31 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
             1,
             1,
         )
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        if current_platform.use_transformers_neuronx():
+            raise NotImplementedError(
+                f"{type(self)} does not support LoRA with Neuron Framework "
+                f"Transformers NeuronX")
+        return self.model_runner.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        if current_platform.use_transformers_neuronx():
+            raise NotImplementedError(
+                f"{type(self)} does not support LoRA with Neuron Framework "
+                f"Transformers NeuronX")
+        return self.model_runner.remove_lora(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        if current_platform.use_transformers_neuronx():
+            raise NotImplementedError(
+                f"{type(self)} does not support LoRA with Neuron Framework "
+                f"Transformers NeuronX")
+        return self.model_runner.pin_lora(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        if current_platform.use_transformers_neuronx():
+            raise NotImplementedError(
+                f"{type(self)} does not support LoRA with Neuron Framework "
+                f"Transformers NeuronX")
+        return self.model_runner.list_loras()
diff --git a/vllm/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py
index 4e784e5e0302d20a5ac13a031f9a6b6457a8442f..2a0f4e77c99e5c487f0b8457bcc9ce4f625f4945 100644
--- a/vllm/worker/neuronx_distributed_model_runner.py
+++ b/vllm/worker/neuronx_distributed_model_runner.py
@@ -1,17 +1,27 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import List, Optional
+from typing import List, Optional, Set
 
 import torch
+from neuronx_distributed_inference.models.mllama.aspect_ratio_utils import (
+    get_all_supported_aspect_ratios)
 from neuronx_distributed_inference.modules.generation.sampling import (
     prepare_sampling_params)
+from neuronx_distributed_inference.modules.lora_serving import (
+    LoraCheckpoint, LoraServingConfig)
 
 from vllm.config import VllmConfig
+from vllm.entrypoints.openai.serving_models import LoRAModulePath
 from vllm.logger import init_logger
+from vllm.lora.layers import LoRAMapping
+from vllm.lora.request import LoRARequest
+from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader.neuronx_distributed import (
     _get_model_architecture, get_neuron_model)
-from vllm.sequence import IntermediateTensors
+from vllm.multimodal import MultiModalKwargs
+from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.worker.neuron_model_runner import (ModelInputForNeuron,
                                              NeuronModelRunner)
 
@@ -25,11 +35,44 @@ class NeuronxDistributedModelRunner(NeuronModelRunner):
         vllm_config: VllmConfig,
     ):
         super().__init__(vllm_config)
+        self.lora_checkpoint = None
+        self.model = None
+        self.lora_serving_config = None
+
+    @staticmethod
+    def _get_lora_paths_strings(lora_modules: List[LoRAModulePath]):
+        if not lora_modules:
+            return None
+        return {_.get("name"): _.get("path") for _ in lora_modules}
+
+    def _get_nxdi_lora_config(self):
+        override_neuron_config = self.model_config.override_neuron_config
+        lora_modules = override_neuron_config.pop("lora_modules", None)
+        target_modules = override_neuron_config.pop("target_modules", None)
+        lora_ckpt_paths = self._get_lora_paths_strings(lora_modules)
+        if self.lora_config.max_loras < len(lora_ckpt_paths):
+            raise ValueError(
+                "Number of LoRAs (%s) exceeds maximum "
+                "allowed (%s)", len(lora_ckpt_paths),
+                self.lora_config.max_loras)
+
+        return LoraServingConfig(
+            max_loras=self.lora_config.max_loras,
+            max_lora_rank=self.lora_config.max_lora_rank,
+            target_modules=target_modules,
+            lora_ckpt_paths=lora_ckpt_paths,
+        )
 
     def load_model(self) -> None:
-        self.model = get_neuron_model(self.model_config,
-                                      parallel_config=self.parallel_config,
-                                      scheduler_config=self.scheduler_config)
+        # Update LoRA config
+        if self.lora_config is not None:
+            self.lora_serving_config = self._get_nxdi_lora_config()
+            self.lora_checkpoint = LoraCheckpoint(self.lora_serving_config)
+        self.model = get_neuron_model(
+            self.model_config,
+            parallel_config=self.parallel_config,
+            scheduler_config=self.scheduler_config,
+            lora_serving_config=self.lora_serving_config)
 
     def get_nxd_sampling_params(self, sampling_metadata):
         if self.model.config.neuron_config.on_device_sampling_config:
@@ -81,42 +124,28 @@ class NeuronxDistributedModelRunner(NeuronModelRunner):
         sampling_params = self.get_nxd_sampling_params(
             model_input.sampling_metadata)
 
-        if model_input.multi_modal_kwargs.get('image') is not None:
-            pixel_values = []
-            aspect_ratios = []
-            num_chunks = []
-            has_image = []
-            for multi_modal_input in model_input.multi_modal_kwargs.get(
-                    'image'):
-                image_tensors = self.get_multi_modal_data_neuron(
-                    multi_modal_input.squeeze(0))
-                pixel_values.append(image_tensors[0])
-                aspect_ratios.append(image_tensors[1])
-                num_chunks.append(image_tensors[2])
-                has_image.append(image_tensors[3])
-
-            pixel_values = torch.cat(pixel_values, dim=0)
-            aspect_ratios = torch.cat(aspect_ratios, dim=0)
-            num_chunks = torch.cat(num_chunks, dim=0)
-            has_image = torch.cat(has_image, dim=0)
-
+        if model_input.multi_modal_kwargs.get('pixel_values') is not None:
             hidden_states = self.model(
                 input_ids=model_input.input_tokens,
                 positions=model_input.input_positions,
                 seq_ids=model_input.input_block_ids,
-                pixel_values=pixel_values,
-                aspect_ratios=aspect_ratios,
+                pixel_values=model_input.multi_modal_kwargs.get(
+                    'pixel_values'),
+                aspect_ratios=model_input.multi_modal_kwargs.get(
+                    'aspect_ratios'),
                 sampling_params=sampling_params,
-                num_chunks=num_chunks,
-                has_image=has_image,
+                num_chunks=model_input.multi_modal_kwargs.get('num_chunks'),
+                has_image=model_input.multi_modal_kwargs.get(
+                    'has_image').squeeze(1),
             )
         else:
-            empty_pixel_values = torch.zeros([1, 1, 4, 3, 560, 560],
+            bs = model_input.input_tokens.shape[0] if (model_input.input_tokens
+                                                       is not None) else 1
+            empty_pixel_values = torch.zeros([bs, 1, 4, 3, 560, 560],
                                              dtype=torch.bfloat16)
-            empty_aspect_ratios = torch.ones([1, 1, 2], dtype=torch.int64)
-            num_chunks = torch.tensor([[1]
-                                       ])  # dummy num_chunks, will not be used
-            has_image = torch.tensor([0])
+            empty_aspect_ratios = torch.ones([bs, 1, 2], dtype=torch.int64)
+            num_chunks = torch.zeros((bs, 1), dtype=torch.int32)
+            has_image = torch.zeros([bs], dtype=torch.int32)
             hidden_states = self.model(
                 input_ids=model_input.input_tokens,
                 positions=model_input.input_positions,
@@ -134,3 +163,132 @@ class NeuronxDistributedModelRunner(NeuronModelRunner):
         )
 
         return [output]
+
+    def process_multi_modal_data_neuron(self, mm_data):
+        # Neuron uses aspect_ratios instead of aspect_ratio_ids
+        all_supported_aspect_ratios = get_all_supported_aspect_ratios(
+            self.model.config.vision_config.max_num_tiles)
+        aspect_ratio_ids = mm_data.get("aspect_ratio_ids")
+        mm_data["aspect_ratios"] = torch.tensor(
+            all_supported_aspect_ratios[aspect_ratio_ids]).unsqueeze(0)
+
+        # Neuron's num_chunks is HF's num_tiles
+        mm_data["num_chunks"] = mm_data.get("num_tiles")
+
+        # Input has an image if it has pixel_values
+        bs = mm_data["num_chunks"].shape[0]
+        pixel_values = mm_data.get("pixel_values")
+        if pixel_values is not None and not torch.all(pixel_values == 0):
+            mm_data["has_image"] = torch.ones(bs)
+
+        else:
+            mm_data["has_image"] = torch.zeros(bs)
+        return mm_data
+
+    def _get_lora_adapter_ids(self, seq_group_metadata_list):
+        # set LoRA adapter IDs for multi-lora serving
+        batch_size = len(seq_group_metadata_list)
+        if self.lora_checkpoint is not None:
+            # "0" indicates NxDI to use the base model for inference
+            adapter_ids = ["0"] * batch_size
+            for idx, seq_group_metadata in enumerate(seq_group_metadata_list):
+                if seq_group_metadata.lora_request is not None:
+                    adapter_ids[
+                        idx] = seq_group_metadata.lora_request.lora_name
+
+            # convert adapter_ids from strings to integers
+            adapter_ids = self.lora_checkpoint.convert_adapter_ids_to_indices(
+                adapter_ids, batch_size)
+        else:
+            adapter_ids = torch.zeros((batch_size), dtype=torch.int32)
+
+        return adapter_ids
+
+    def prepare_model_input(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        virtual_engine: int = 0,
+        finished_requests_ids: Optional[List[str]] = None
+    ) -> ModelInputForNeuron:
+        # NOTE: We assume that all sequences in the group are all prompts or
+        # all decodes.
+        is_prompt = seq_group_metadata_list[0].is_prompt
+        # Prepare input tensors.
+        if is_prompt:
+            (input_tokens, input_positions, input_block_ids, seq_lens,
+             multi_modal_kwargs
+             ) = self._prepare_prompt(seq_group_metadata_list)
+        else:
+            (input_tokens, input_positions,
+             input_block_ids) = self._prepare_decode(seq_group_metadata_list)
+            seq_lens = None
+
+        if not self._on_device_sampling_disabled:
+            for seq_group_metadata in seq_group_metadata_list:
+                sampling_params = seq_group_metadata.sampling_params
+                top_k, top_p, temperature = (
+                    self._convert_to_neuron_sampling_params(sampling_params))
+                sampling_params.top_k = top_k
+                sampling_params.top_p = top_p
+                sampling_params.temperature = temperature
+
+        # we need multi_modal_data for later tokens as well
+        multi_modal_kwargs_list: List[MultiModalKwargs] = []
+        for seq_group_metadata in seq_group_metadata_list:
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                multi_modal_kwargs_list.append(mm_data)
+        multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
+
+        lora_adapter_ids = self._get_lora_adapter_ids(seq_group_metadata_list)
+
+        sampling_metadata = SamplingMetadata.prepare(
+            seq_group_metadata_list,
+            seq_lens,
+            # query_lens is not needed if chunked prefill is not
+            # supported. Since neuron worker doesn't support chunked prefill
+            # just use seq_lens instead.
+            seq_lens,
+            self.device,
+            self.pin_memory,
+            generators=self.get_generators(finished_requests_ids))
+
+        return ModelInputForNeuron(input_tokens=input_tokens,
+                                   input_positions=input_positions,
+                                   input_block_ids=input_block_ids,
+                                   sampling_metadata=sampling_metadata,
+                                   multi_modal_kwargs=multi_modal_kwargs,
+                                   adapter_ids=lora_adapter_ids)
+
+    def remove_all_loras(self):
+        raise NotImplementedError(
+            "Managing LoRAs is only supported through the "
+            "lora_modules parameter in override_neuron_config")
+
+    def set_active_loras(self, lora_requests: Set[LoRARequest],
+                         lora_mapping: LoRAMapping) -> None:
+        raise NotImplementedError(
+            "Managing LoRAs is only supported through the "
+            "lora_modules parameter in override_neuron_config")
+
+    def add_lora(self, lora_request: LoRARequest):
+        logger.warning(
+            "Adding LoRAs is only supported through the "
+            "lora_modules parameter in override_neuron_config. If you supplied "
+            "the parameter, you can ignore this warning. Ignoring"
+            "lora request: ", lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError(
+            "Managing LoRAs is only supported through the "
+            "lora_modules parameter in override_neuron_config")
+
+    def pin_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError(
+            "Managing LoRAs is only supported through the "
+            "lora_modules parameter in override_neuron_config")
+
+    def list_loras(self) -> Set[int]:
+        raise NotImplementedError(
+            "Managing LoRAs is only supported through the "
+            "lora_modules parameter in override_neuron_config")
diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py
index 912e04c435f54ae14cc2e2b6b6f87a1d78289105..f80955f71a5a3d6662ddf138f860efcc190a3b9e 100644
--- a/vllm/worker/pooling_model_runner.py
+++ b/vllm/worker/pooling_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 from typing import Any, Dict, List, Optional, Tuple, Type, Union
@@ -121,7 +122,6 @@ class PoolingModelRunner(
                 intermediate_tensors=intermediate_tensors,
                 **MultiModalKwargs.as_kwargs(
                     multi_modal_kwargs,
-                    dtype=self.model_config.dtype,
                     device=self.device,
                 ),
                 **cross_enc_kwargs,
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index e0cca9072745864f8d77dccdf3472dde03394e1a..5f1535271b9ac4e0626e7b23a33021235bb44aeb 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
 import time
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 4bb9bea022f9934006b498b3d4a45988dba09397..ad5ed19e2f894546ca08d743de2f7fef6507d672 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from typing import List, Optional, Tuple, Union
diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py
index e2854bcb37cefd72b4d32ba0b48bb86b5c36bdfa..1a5f62cb3c47141a4506e9d447136e8647e1ceb9 100644
--- a/vllm/worker/utils.py
+++ b/vllm/worker/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 '''
 Worker-related helper functions.
 '''
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 6e45b8423e5ec243f4e5d7d72cc9f6525b3b7384..9a928632688a177ea33adb83f0f9e58be8237a3c 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A GPU worker class."""
 import gc
 import os
@@ -128,6 +129,8 @@ class Worker(LocalOrDistributedWorkerBase):
         if self.profiler is None:
             raise RuntimeError("Profiler is not enabled.")
         self.profiler.stop()
+        print(
+            self.profiler.key_averages().table(sort_by="self_cuda_time_total"))
 
     def sleep(self, level: int = 1) -> None:
         free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index e5662e69343c6f5600c2ff96b12a39431c4b7bee..0b37caa71669c4e3e9779aa5a8b08850b22f4047 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import os
@@ -596,7 +597,8 @@ class WorkerWrapperBase:
 
     def initialize_from_config(self, kv_cache_configs: List[Any]) -> None:
         kv_cache_config = kv_cache_configs[self.rpc_rank]
-        self.worker.initialize_from_config(kv_cache_config)  # type: ignore
+        with set_current_vllm_config(self.vllm_config):
+            self.worker.initialize_from_config(kv_cache_config)  # type: ignore
 
     def init_device(self):
         with set_current_vllm_config(self.vllm_config):
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 79fa7d2c73e8896c4ed837fd7e7e8ae4e1b8c341..b2d3ce8526d51231756cedeabf48bd10c541d866 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
 import time
@@ -564,7 +565,6 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
                 intermediate_tensors=intermediate_tensors,
                 **MultiModalKwargs.as_kwargs(
                     model_input.multi_modal_kwargs or {},
-                    dtype=self.model_config.dtype,
                     device=self.device,
                 ),
             )
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index a5109a982cbfe6e95a739db4eb0a7aae0f1732d0..fe321c059f526bacd96fbc84740cdcda8537b930 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A XPU worker class."""
 import gc
 import os